def unfold_section(self, obj, level):
    """Group the content of ``obj`` into sections of heading depth ``level``.

    Headers of the requested depth are paired with the content chunks
    returned by ``self.split_list`` (presumably the runs of nodes between
    those headers -- confirm against ``split_list``).  The regrouped content
    is passed through ``self`` and merged back into ``obj``.

    Args:
        obj: node whose ``"content"`` list is regrouped.
        level: heading depth that opens a new section.

    Returns:
        The result of merging ``obj`` with the transformed ``"content"``.
    """

    def opens_section(node):
        return node["type"] == "header" and node["depth"] == level

    def as_section(heading, body):
        return {
            "type": "section",
            "title": heading["content"],
            "depth": heading["depth"],
            "content": body,
        }

    children = obj["content"]
    headings = [node for node in children if opens_section(node)]
    chunks = self.split_list(opens_section, children)

    if not headings and len(chunks) == 1:
        # No headers of this depth: keep the content untouched.
        subsections = children
    elif len(headings) == len(chunks):
        # Every chunk belongs to a header (nothing precedes the first one).
        subsections = [as_section(h, c) for h, c in zip(headings, chunks)]
    elif len(headings) == len(chunks) - 1:
        # One leading chunk precedes the first header; keep it in front.
        sections = [as_section(h, c) for h, c in zip(headings, chunks[1:])]
        subsections = chunks[0] + sections
    else:
        # Headers and chunks cannot be paired up.
        message = "ill-formed structure in article"
        subsections = [{"type": "error", "message": message}]
        log_parser_error(message, obj, position=self.current_position)

    return merge(obj, {"content": self(subsections)})
def transform_chapter(self, obj):
    """Sum the per-author values of all children into the chapter itself.

    Each entry of ``obj["children"]`` must provide an ``"authors"`` mapping;
    values stored under the same key are added up.

    Returns:
        The result of merging ``obj`` with the accumulated ``"authors"``
        mapping (a ``defaultdict(int)``).
    """
    totals = defaultdict(int)
    for child in obj["children"]:
        for author, amount in child["authors"].items():
            totals[author] += amount
    return merge(obj, {"authors": totals})
def transform_header(self, obj):
    """Split a header into its heading text and its trailing anchor.

    The last content node is expected to be the ``anker`` template; its
    parameter ``"1"`` becomes the anchor and everything before it becomes
    the heading (after ``text_rstrip``).

    Returns:
        The result of merging ``obj`` with the new ``"content"`` and
        ``"anchor"`` entries.
    """
    # NOTE(review): these comparisons are evaluated only for their effect;
    # presumably check() returns a wrapper whose ``==`` rejects a mismatch
    # -- confirm against the definition of check().
    check(obj, "content", -1, "type") == "template"
    check(obj, "content", -1, "name") == "anker"

    nodes = obj["content"]
    anchor_template = nodes[-1]
    return merge(obj, {
        "content": text_rstrip(nodes[:-1]),
        "anchor": anchor_template["params"]["1"],
    })
def transform_article(self, obj):
    """Replace every ``included_section`` node by its own content.

    Returns:
        ``obj`` itself when it contains no ``included_section`` node,
        otherwise the result of merging ``obj`` with the flattened
        ``"content"`` list.
    """
    nodes = obj["content"]
    if not any(node["type"] == "included_section" for node in nodes):
        return obj

    flattened = []
    for node in nodes:
        if node["type"] == "included_section":
            # Splice the children of the included section in place.
            flattened.extend(node["content"])
        else:
            flattened.append(node)
    return merge(obj, {"content": flattened})
def test_merge(self):
    """Check ``merge`` for ``None``, list and dict operands."""
    self.assertEqual(merge(None, "a"), "a")
    self.assertListEqual(merge(None, [1, 2]), [1, 2])
    self.assertIsNone(merge(None, None))

    # (first operand, second operand, expected result)
    cases = [
        ([1, 2], [3, 4], [1, 2, 3, 4]),
        ([], [3, 4], [3, 4]),
        ([1, 2], [], [1, 2]),
        (["a"], ["b"], ["a", "b"]),
        ({"a": 1}, {"b": 2}, {"a": 1, "b": 2}),
        ({"a": 1}, {"a": 2}, {"a": 2}),
        ({}, {"a": 2}, {"a": 2}),
        ({"a": 2}, {}, {"a": 2}),
    ]

    for left, right, expected in cases:
        left_snapshot = left.copy()
        right_snapshot = right.copy()

        if isinstance(left, dict):
            assert_same = self.assertDictEqual
        else:
            assert_same = self.assertListEqual

        assert_same(merge(left, right), expected)

        # Neither operand may be modified by merge().
        assert_same(left, left_snapshot)
        assert_same(right, right_snapshot)
def transform_article(self, article):
    """Fetch, parse and attribute a single article.

    Logs a report line linking to the article, parses the content obtained
    from ``self.api`` with an ``ArticleContentParser`` and looks up the
    article's authors.

    Returns:
        The result of merging ``article`` with its parsed ``"content"`` and
        its ``"authors"``.
    """
    title = article["title"]
    parser = ArticleContentParser(api=self.api, title=title)

    link = self.api._index_url + "?title=" + title.replace(" ", "+")
    report_logger.info(
        "== Parsing of Article [{} {}] ==".format(link, title))

    content = parser(self.api.get_content(title))
    authors = self.get_article_authors(title)
    return merge(article, {"content": content, "authors": authors})
def normalize(self, obj, mode):
    """Normalise ``obj["formula"]`` through the API for the given ``mode``.

    In ``"tex"`` mode the ``{\\begin{aligned}`` / ``\\end{aligned}}``
    wrapper is stripped from the normalised formula.

    Returns:
        The result of merging ``obj`` with the normalised ``"formula"``, or
        an error node when a ``ValueError`` is raised while normalising or
        stripping.
    """
    try:
        normalized = self.api.normalize_formula(obj["formula"], mode)
        if mode == "tex":
            without_prefix = remove_prefix(normalized, "{\\begin{aligned}")
            normalized = remove_suffix(without_prefix, "\\end{aligned}}")
    except ValueError:
        message = "Wrong formatted formula"
        # TODO: current_section was not set for this class
        log_parser_error(message, obj)
        return {"type": "error", "message": message}
    else:
        return merge(obj, {"formula": normalized})
def transform_template(self, obj):
    """Collect numbered template parameters into ``<prefix>_list`` entries.

    For every prefix registered in ``TEMPLATE_LIST_PARAMS`` for this
    template name, the parameters ``<prefix>1``, ``<prefix>2``, ... are
    removed (up to the first missing number) and stored, in order, under
    ``<prefix>_list``.

    Returns:
        The result of merging ``obj`` with the rewritten ``"params"``.

    Raises:
        NotInterested: if the template name is not in
            ``TEMPLATE_LIST_PARAMS``.
    """
    if obj["name"] not in TEMPLATE_LIST_PARAMS:
        raise NotInterested()

    # Work on a copy so the parameters of ``obj`` stay untouched.
    params = obj["params"].copy()
    for param_prefix in TEMPLATE_LIST_PARAMS[obj["name"]]:
        collected = []
        index = 1
        while True:
            key = param_prefix + str(index)
            if key not in params:
                break
            collected.append(params.pop(key))
            index += 1
        params[param_prefix + "_list"] = collected

    return merge(obj, {"params": params})
def change_inline(self, obj, i, n):
    """Normalise the whitespace of inline node ``obj``.

    Text nodes get every run of whitespace collapsed into a single space.
    The first node (``i == 0``) additionally loses leading whitespace and
    the last node (``i == n - 1``) loses trailing whitespace.  Any other
    node type is handed to ``self`` unchanged.

    Args:
        obj: the inline node to process.
        i: index of ``obj`` among its siblings.
        n: number of siblings.

    Returns:
        The result of merging ``obj`` with the cleaned ``"data"``, ``None``
        if no text is left, or ``self(obj)`` for non-text nodes.
    """
    if lookup(obj, "type") != "text":
        return self(obj)

    # One pass replaces the former two-step substitution; since every
    # whitespace character becomes a space, no newline can survive here
    # (the old debug print for that case was unreachable).
    data = re.sub(r"\s+", " ", obj["data"])

    if i == 0:
        data = data.lstrip()
    if i == n - 1:
        data = data.rstrip()

    if not data:
        return None
    return merge(obj, {"data": data})
def query(self, params, path_to_result):
    """Run an ``action=query`` API request and follow its continuations.

    The request is repeated with the continuation parameters returned by
    the server until no ``"continue"`` entry is present; the partial
    results found under ``["query"] + path_to_result`` are combined with
    ``merge``.

    Args:
        params: request parameters; this mapping is not modified.
        path_to_result: path of keys below ``"query"`` leading to the
            wanted part of each response.

    Returns:
        The merged result of all responses.

    Raises:
        ConnectionError: if the API answers with an ``"error"`` entry.
    """
    # Copy so that neither the fixed parameters nor the continuation
    # tokens leak into the caller's dict (which may be reused).
    request_params = dict(params)
    request_params["format"] = "json"
    request_params["action"] = "query"
    full_path = ["query"] + path_to_result

    result = None
    while True:
        api_result = self.req.get(
            self._api_url, params=request_params).json()

        if "error" in api_result:
            message = "Error while making API call."
            # The human readable text lives inside the error object
            # ({"error": {"code": ..., "info": ...}}), not at top level.
            error = api_result["error"]
            if isinstance(error, dict):
                message = error.get("info", message)
            raise ConnectionError(message)

        result = merge(result, query_path(api_result, full_path))

        if "continue" not in api_result:
            return result
        request_params.update(api_result["continue"])
def transform_dict(self, obj):
    """Fill in the default values registered for the node type of ``obj``.

    The node type must be known to ``DEFAULT_VALUES`` (enforced through
    ``check``).  The defaults for that type are merged with the result of
    the parent class's ``act_on_dict``.
    """
    check(obj, "type").of(DEFAULT_VALUES)

    defaults = DEFAULT_VALUES[obj["type"]]
    transformed = super(NodeTransformation, self).act_on_dict(obj)
    return merge(defaults, transformed)
def transform_template(self, obj):
    """Translate a template node into its dedicated node type.

    Handled, in this order:

    * templates listed in ``BOXSPEC`` -> node of the box type with the
      mapped parameters transformed by ``self``;
    * ``liste`` -> ``list`` node;
    * ``formel`` -> ``equation`` node, or an ``error`` node when the
      parameter is not exactly one ``inlinemath`` element;
    * ``(!``, ``#invoke:...`` and ``noprint`` -> ``None``;
    * ``todo`` -> ``error`` node;
    * anything else -> ``notimplemented`` node.

    Problems are additionally reported through ``log_parser_error``.
    """
    name = obj["name"]

    for bname, tname, param_names in BOXSPEC:
        if name != tname:
            continue
        box = {
            key: self(obj["params"].get(source, None))
            for key, source in param_names.items()
        }
        return merge(box, {"type": bname})

    if name == "liste":
        params = obj["params"]
        if "liste" in params:
            # The list was already parsed; reuse its items and ordering.
            sublist = params["liste"][0]
            assert sublist["type"] == "list"
            items = sublist["items"]
            ordered = sublist["ordered"]
        else:
            items = [
                {"type": "listitem", "content": self(entry)}
                for entry in params["item_list"]
            ]
            ordered = params.get("type", "") == "ol"
        return {
            "type": "list",
            "items": items,
            "ordered": ordered,
            "spacing": params.get("abstand", None),
        }

    if name == "formel":
        formula = obj["params"].get("1", [])
        is_single_math = (len(formula) == 1
                          and lookup(formula, 0, "type") == "inlinemath")
        if not is_single_math:
            message = "Wrong formatted equation"
            details = ("Equation source code must be completely contained in "
                       "just one <math></math>.\n (use \\text{this is not math} "
                       "macro instead)")
            log_parser_error(message, obj, details, self.current_section)
            return {"type": "error", "message": message}

        formula = formula[0]["formula"]
        already_wrapped = (formula.startswith("\\begin{align}")
                           and formula.endswith("\\end{align}"))
        if already_wrapped:
            # Strip the existing environment so it is not nested below.
            formula = remove_prefix(formula, "\\begin{align}")
            formula = remove_suffix(formula, "\\end{align}")
        formula = "\\begin{align}" + formula + "\\end{align}"
        return {"type": "equation", "formula": formula}

    if name == "(!":
        return None

    if name.startswith("#invoke:"):
        # Template is header or footer
        return None

    if name == "noprint":
        return None

    if name == "todo":
        message = "Todo-Message in MediaWiki code."
        details = "Check if this TODO shoud be completed for a book release."
        log_parser_error(message, obj, details, self.current_section)
        return {"type": "error", "message": message}

    message = "Parsing of template `{}`".format(name)
    log_parser_error(message, obj, position=self.current_section)
    return {"type": "notimplemented", "target": obj, "message": message}