Пример #1
0
def fix_image_tables(root):
    """
    Tag short, non-infobox tables containing image links and size their images.

    Every matching table loses its ``margin`` style and gets the
    ``image-table`` class. The widest image (in px) per cell position is
    collected and summed up:

    - if the sum, converted with ``config.px2pt``, exceeds
      ``config.page_width_pt`` the table is tagged ``wide-image-table`` and
      each cell gets a percentage width proportional to its widest image
    - otherwise, if any width was found, every image is passed to
      ``_resize_image_node_width_to_pt``
    """
    query = (
        '//table[contains(@class, "short-table") and not(contains(@class, "infobox"))'
        ' and .//a[contains(@class, "image")]]'
    )
    for table in root.xpath(query):
        utils.remove_node_styles(table, "margin")
        utils.append_class(table, "image-table")

        # widest image per cell index, in px
        widest = {}
        for tr in table.xpath(".//tr"):
            for col_idx, cell in enumerate(tr.xpath(".//td")):
                for image in cell.xpath(".//img"):
                    image_width = utils.get_node_width(image, target_unit="px")
                    widest[col_idx] = max(image_width, widest.get(col_idx, 0))

        width_sum = sum(widest.values())
        if width_sum * config.px2pt > config.page_width_pt:
            utils.append_class(table, "wide-image-table")
            for tr in table.xpath(".//tr"):
                for col_idx, cell in enumerate(tr.xpath(".//td")):
                    _remove_inner_image_node_width(cell, "image")
                    utils.remove_node_styles(cell, ["padding-left", "padding", "margin"])
                    share = widest.get(col_idx, 0) / width_sum * 100
                    utils.add_node_style(cell, "width", "{}%".format(share))
        elif width_sum > 0:
            for image in table.xpath(".//img"):
                _resize_image_node_width_to_pt(image)
Пример #2
0
def handle_tiny_table(node, width, height):
    """
    float small tables by tagging them ``pp_float_table``
    - only when ``node_is_floatable`` accepts the node for the given size
    """
    if not node_is_floatable(node, width, height):
        return
    utils.append_class(node, "pp_float_table")
Пример #3
0
def remove_img_style_size(root):
    """
    replace explicit thumb image sizes by a ``col-*`` class on the container

    - selects ``div`` nodes whose class contains "thumb" but none of
      "tmulti", "thumbinner", "thumbcaption", "thumbimage"
    - containers whose class contains "map" are skipped
    - width/height/max-width styles are stripped from inner "thumbinner" nodes
    - the first ``<img>`` decides the column class (at most ``col-12``)
    """
    excluded = ("tmulti", "thumbinner", "thumbcaption", "thumbimage")
    predicate = 'contains(@class,"thumb")  ' + " ".join(
        'and not(contains(@class, "{}"))'.format(name) for name in excluded
    )
    for container in root.xpath("//div[{}]".format(predicate)):
        if "map" in container.attrib.get("class", ""):
            continue
        for inner in container.xpath('.//*[contains(@class,"thumbinner")]'):
            utils.remove_node_styles(inner, ["width", "height", "max-width"])
        images = container.xpath(".//img")
        if not images:
            log.debug("No <img> found in {}".format(etree.tostring(container)))
            continue
        img = images[0]
        width = utils.get_node_width(img, target_unit="pt")
        utils.remove_node_styles(img, ["width", "height"])
        # number of four-column groups the image spans, capped at three
        groups = min(int(round(width / (column_width_pt * 4))), 3)
        utils.append_class(container, "col-{}".format(groups * 4))
        utils.remove_node_width(container)
        utils.remove_node_width(img)
Пример #4
0
def change_references_id_to_class(root):
    """
    Replace ``id="References"`` by a ``references`` class.

    The class goes on the node itself when it is an ``h2``, otherwise on its
    parent; the ``id`` attribute is removed from the node in both cases.
    """
    for anchor in root.xpath('//*[@id="References"]'):
        target = anchor if anchor.tag == "h2" else anchor.getparent()
        utils.append_class(target, "references")
        del anchor.attrib["id"]
Пример #5
0
def move_caption(node):
    """
    Wrap a caption node and the content preceding it in a ``div.pp-table``.

    The node is tagged ``pp-table-caption`` and any ":" is stripped from the
    text of its first grandchild (best effort). Walking backwards from the
    node, preceding ``p``/``ul`` siblings are skipped until a different tag or
    a ``gallery`` list is found; that sibling and everything between it and
    the caption is moved into the wrapper, after the caption. If the first
    wrapped sibling is a ``table``, a copy of the caption is appended to it as
    a ``caption`` element with the additional class ``following``.

    :param node: caption element; must have a parent
    """
    utils.append_class(node, "pp-table-caption")
    wrapper = E.div({"class": "pp-table"})
    try:
        node[0][0].text = node[0][0].text.replace(":", "")
        node[0].tail = ""
    except (AttributeError, IndexError, TypeError):
        # caption without the expected child structure or without text:
        # report and carry on. encoding="unicode" yields text, so the
        # concatenation does not mix str and bytes.
        print("Error at: " + etree.tostring(node, encoding="unicode"))
    parent = node.getparent()
    node_pos = parent.index(node)
    nodelist = parent.getchildren()
    indexpos = node_pos - 1
    # stop at the start of the sibling list: a negative index would wrap
    # around to the end of nodelist (or finally raise IndexError)
    while indexpos >= 0 and nodelist[indexpos].tag in ["p", "ul"]:
        if "gallery" in (nodelist[indexpos].get("class") or ""):
            break
        indexpos -= 1

    wrapper.append(node)
    if node_pos == 0:
        # nothing precedes the caption; nodelist[0] is the caption itself,
        # which now lives inside the wrapper, so insert via the parent
        parent.insert(0, wrapper)
    else:
        if indexpos < 1:
            indexpos = 1
        nodelist[indexpos - 1].addnext(wrapper)
        for i in range(indexpos, node_pos):
            wrapper.append(nodelist[i])

    # add second caption to tables (only if something besides the caption
    # was actually wrapped)
    if len(wrapper) > 1 and wrapper[1].tag == "table":
        node2 = deepcopy(node)
        node2.tag = "caption"
        utils.append_class(node2, "following")
        wrapper[1].append(node2)
Пример #6
0
def remove_style_sizes(root):
    """
    Strip explicit sizes from tables that carry a ``style`` attribute.

    Width/height styles and the node width are removed; a non-empty
    ``border`` attribute is replaced by the ``pp_border_table`` class.
    """
    for styled_table in root.xpath("//table[@style]"):
        utils.remove_node_styles(styled_table, ["width", "height"])
        utils.remove_node_width(styled_table)
        if not styled_table.attrib.get("border"):
            continue
        del styled_table.attrib["border"]
        utils.append_class(styled_table, "pp_border_table")
Пример #7
0
def tag_local_images(root, collection):
    """
    Tag images whose file exists in the collection's image directory.

    :param root: tree whose ``<img>`` nodes are inspected
    :param collection: mapping; ``collection.get("image_path")`` names the
        directory holding the local images

    The last path segment of each image's ``_src`` attribute is compared with
    the directory listing; matching images get the ``local-image`` class.
    Nothing happens when the path is unset or is not an existing directory.
    """
    input_image_path = collection.get("image_path")
    # isdir instead of exists: os.listdir raises on a regular file
    if not input_image_path or not os.path.isdir(input_image_path):
        return
    # a set makes the per-image membership test constant time
    input_images = set(os.listdir(input_image_path))
    for img in root.xpath("//img"):
        file_name = img.get("_src", "").rsplit("/", 1)[-1]
        if file_name in input_images:
            utils.append_class(img, "local-image")
Пример #8
0
def handle_col_floats(root):
    """
    Decide per infobox whether it may float.

    Boxes lower than ``config.min_float_height``, or lower than twice that
    height while ``node_is_floatable`` accepts them, get ``pp_no_float``.
    All other boxes lose a ``pp_float_table`` class if they carry one.
    """
    for box in root.xpath('//*[contains(@class, "infobox")]'):
        width, height = node_size(box)
        if height < config.min_float_height or (
            height < 2 * config.min_float_height
            and node_is_floatable(box, width, height)
        ):
            utils.append_class(box, "pp_no_float")
            continue
        if "pp_float_table" in box.get("class", ""):
            utils.remove_class(box, "pp_float_table")
Пример #9
0
def handle_span_all(node, width, height, two_col_max_size, debug):
    """
    tag nodes wider than ``two_col_max_size`` with ``pp_singlecol``
    - in debug mode the node additionally gets a red background
    - ``height`` is accepted but not used
    """
    if not width > two_col_max_size:
        return
    if debug:
        utils.add_node_style(node, "background-color", "red")
    utils.append_class(node, "pp_singlecol")
Пример #10
0
def improve_table_breaks(root):
    """
    Keep the first and last rows of top-level, non-infobox tables together.

    Up to ``config.table_no_break_max_lines`` rows from the top are tagged
    ``pp_nobreak_after`` and the same number from the bottom
    ``pp_nobreak_before``.

    Example page: https://de.wikipedia.org/wiki/Suzy_Batkovic-Brown
    """
    query = '//table[not(ancestor::table) and not(contains(@class, "infobox"))]'
    for table in root.xpath(query):
        rows = table.xpath("./tr|./thead/tr|./tbody/tr")
        limit = min(len(rows), config.table_no_break_max_lines)
        for offset in range(limit):
            leading_row = rows[offset]
            trailing_row = rows[-(offset + 1)]
            utils.append_class(leading_row, "pp_nobreak_after")
            utils.append_class(trailing_row, "pp_nobreak_before")
Пример #11
0
def scale_inline(root):
    """
    Halve the size of small images and tag them ``inline``.

    Images whose ``width`` and ``height`` attributes are both below 50 get
    the size reported by ``get_img_size`` divided by two written back as
    attributes.
    """
    width_limit = 50
    height_limit = 50
    query = "//img[@width<{}][@height<{}]".format(width_limit, height_limit)
    for small_img in root.xpath(query):
        img_width, img_height = get_img_size(small_img)
        small_img.set("width", str(img_width / 2))
        small_img.set("height", str(img_height / 2))
        utils.append_class(small_img, "inline")
Пример #12
0
def map_classes(article):
    """
    Rename CSS classes according to the class map of the article's language.

    The map comes from ``get_map(_class_map, article.language)``; when it is
    empty nothing happens. For every node with a ``class`` attribute each
    mapped class found in its space-separated class list is removed and the
    mapped replacement appended.
    """
    class_map = get_map(_class_map, article.language)
    if not class_map:
        return
    for node in article.dom.xpath("//*[@class]"):
        present = node.get("class").split(" ")
        for old_cls in class_map:
            if old_cls not in present:
                continue
            utils.remove_class(node, old_cls)
            utils.append_class(node, class_map[old_cls])
Пример #13
0
def fix_galleries(root):
    """
    Normalise gallery lists.

    All descendants of a ``ul`` gallery lose explicit width, height and
    margin. Each ``gallerybox`` item gets ``col-4``; its image (fifth
    nesting level below the item) gets ``thumbimage`` and its ``src`` is set
    as ``background-image`` on the element three levels below the item.
    """
    for gallery in root.xpath('.//ul[contains(@class, "gallery")]'):
        for descendant in gallery.xpath(".//*"):
            utils.remove_node_width(descendant)
            utils.remove_node_height(descendant)
            utils.remove_node_styles(descendant, "margin")
        for box in gallery.xpath('.//li[contains(@class, "gallerybox")]'):
            utils.append_class(box, "col-4")
            frame = box[0][0][0]
            img = frame[0][0]
            utils.append_class(img, "thumbimage")
            background = "url({})".format(img.attrib.get("src"))
            utils.add_node_style(frame, "background-image", background)
Пример #14
0
def markup_maps(root):
    """
    Tag thumb containers that hold positioned overlays with the ``map`` class.

    A ``div`` qualifies when its class contains "thumb" (but not
    "thumbinner", "thumbcaption" or "thumbimage") and it has a descendant
    ``div`` whose style contains "relative" which in turn has a descendant
    ``div`` whose style contains "absolute".
    """
    target_node = "//div[{}]"
    conditions = [
        'contains(@class, "thumb")',
        'not(contains(@class, "thumbinner"))',
        'not(contains(@class, "thumbcaption"))',
        'not(contains(@class, "thumbimage"))',
        './/div[contains(@style, "relative") and .//div[contains(@style, "absolute")]]',
    ]
    # Alternative selector that was left in the code as a no-op string
    # literal; kept here for reference only:
    # //div[@class="mw-parser-output"]//div[contains(@style, "relative") and .//div[contains(@style, "absolute")]]
    for node in root.xpath(target_node.format(" and ".join(conditions))):
        utils.append_class(node, "map")
Пример #15
0
def add_class_to_infobox_wide_images(root):
    """
    Mark wide infobox images and rescale the width of narrow ones.

    - images with a ``width`` attribute above 100 get ``infobox-img-wide``,
      lose their explicit width and height, and every ancestor ``td`` gets
      ``contains-img-wide``
    - images with a ``width`` of 100 or less get that attribute divided by
      ``config.px2pt``
    - images without a ``width`` attribute, or with one that is not an
      integer (e.g. "50%"), are left untouched
    """
    for node in root.xpath('//*[contains(@class, "infobox")]//img'):
        raw_width = node.attrib.get("width")
        if raw_width is None:
            continue
        try:
            width = int(raw_width)
        except ValueError:
            # a non-integer width must not abort processing of the other images
            continue
        if width > 100:
            utils.append_class(node, "infobox-img-wide")
            utils.remove_node_width(node)
            utils.remove_node_height(node)
            for td in node.xpath("./ancestor::td"):
                utils.append_class(td, "contains-img-wide")
        else:
            node.attrib["width"] = str(width / config.px2pt)
Пример #16
0
def handle_two_col(node, width, height, reg_width, ext_width, debug):
    """
    classify nodes wider than ``reg_width`` but not wider than ``ext_width``
    - taller than ``config.max_two_col_float_height``: ``pp_singlecol``
      (orange background in debug mode)
    - otherwise: ``pp_twocol_span`` (yellow background in debug mode)
    - nodes outside the width range are left alone
    """
    if not (reg_width < width <= ext_width):
        return
    if height > config.max_two_col_float_height:
        if debug:
            utils.add_node_style(node, "background-color", "orange")
        utils.append_class(node, "pp_singlecol")
        return
    utils.append_class(node, "pp_twocol_span")
    if debug:
        utils.add_node_style(node, "background-color", "yellow")
Пример #17
0
def add_pagebreaks(root, article):
    """
    Apply the page break classes requested by the article configuration.

    ``article`` may hold lists of xpath expressions under the keys
    ``page-break-before`` and ``page-break-after``; every node matched by an
    expression gets a class named like the key. Returns ``root``.
    """
    for break_class in ("page-break-before", "page-break-after"):
        if break_class not in article:
            continue
        for expression in article[break_class]:
            for node in root.xpath(expression):
                utils.append_class(node, break_class)

    return root
Пример #18
0
def check_size(article):
    """
    Annotate non-SVG images with their print resolution.

    For every ``<img>`` whose ``src`` does not end in ".svg"/".SVG", passes
    ``node_has_valid_image_src`` and points to an existing file, the pixel
    width of the file is divided by the physical width (``width`` attribute
    times ``config.px2in``). The result is stored as ``data-ppi``, the pixel
    width as ``data-source-image-width``; images below 240 ppi additionally
    get the ``low-ppi`` class.

    Images without a usable positive ``width`` attribute are skipped.
    """
    for img in article.dom.xpath(
        '//img[not(substring(@src, string-length(@src)-3) = ".svg"'
        ' or substring(@src, string-length(@src)-3) = ".SVG")]'
    ):
        if not node_has_valid_image_src(img):
            continue
        path = img.get("src")
        if os.name == "nt":
            # turn percent-encoded slashes/backslashes into path separators
            regex = "%2[fF]|%5[cC]"
            path = re.sub(regex, "/", path)
        if not os.path.exists(path):
            continue
        try:
            physical_width_in = config.px2in * float(img.get("width"))
        except (TypeError, ValueError):
            # width attribute missing or not numeric: no resolution to compute
            continue
        if physical_width_in <= 0:
            # avoids a division by zero below
            continue
        # context manager closes the underlying file handle
        with Image.open(path) as im:
            width = im.size[0]
        ppi = int(round(width / physical_width_in))
        img.set("data-ppi", str(ppi) + "ppi")
        img.set("data-source-image-width", str(width) + "px")
        if ppi < 240:
            utils.append_class(img, "low-ppi")
Пример #19
0
def identify_infoboxes(root):
    """
    Tag tables that look like infoboxes with the ``infobox`` class.

    Two passes:

    1. any table not yet tagged whose attribute values mention "infobox"
       (case-insensitive)
    2. untagged tables inside the first two siblings following
       ``h1.firstHeading``, see below
    """
    for candidate in root.xpath('//table[not(contains(@class, "infobox"))]'):
        for attr_value in candidate.values():
            if "infobox" in attr_value.lower():
                utils.append_class(candidate, "infobox")
                break

    # https://de.wikipedia.org/wiki/Das_M%C3%A4dchen_auf_dem_Meeresgrund
    # Tables close to the article start are considered infoboxes. Wrapping
    # container nodes are stripped as long as every ancestor (except
    # <article>) has exactly one child; if any ancestor has siblings inside,
    # the table is *not* marked as an infobox.
    leading_tables = (
        '//h1[@class="firstHeading"]/'
        "following-sibling::*[position()<3]/"
        'descendant-or-self::table[not(contains(@class, "infobox"))]'
    )
    for table in root.xpath(leading_tables):
        containers = [anc for anc in table.iterancestors() if anc.tag != "article"]
        if not all(len(anc.getchildren()) == 1 for anc in containers):
            continue
        if containers:
            # replace the outermost collected container by the table itself
            outermost = containers[-1]
            outermost.getparent().replace(outermost, table)
        utils.append_class(table, "infobox")
Пример #20
0
def add_figure_numbers(root):
    """
    Number the figure-like nodes of every article and add captions/references.

    Nodes whose class contains one of ``classes`` are counted per article and
    labelled "Figure <pp_article_num>.<n>". For each one a
    ``p.pp_figure_ref`` reference paragraph is created and inserted after the
    figure. ``pp_figure`` nodes that have a "thumbcaption" descendant get the
    label prepended to that caption; all other nodes are wrapped in a ``div``
    carrying the matched class and get a new ``div.pp_figure_caption``.
    Finally ``_combine_references(root)`` is called.
    """
    classes = [
        "pp_singlecol",
        # 'infobox',  # infoboxes are not referenced despite floating
        "pp_figure",
        "pp_twocol_span",
    ]
    # xpath predicate matching any of the classes above (substring match)
    pred = " or ".join('contains(@class, "{}")'.format(cls) for cls in classes)
    # NOTE(review): total_figures is incremented below but never read
    total_figures = 0
    for article in root.xpath("//article"):
        # numbering restarts with every article
        figure_num = 0
        for node in article.xpath(".//*[{}]".format(pred)):
            utils.remove_class(node, "infobox")
            figure_num += 1
            total_figures += 1
            # first entry of `classes` that occurs in the node's class attribute
            cls = [c for c in classes if c in node.get("class")][0]
            # NOTE(review): str.join fails if the article has no pp_article_num attribute
            nr = ".".join([article.get("pp_article_num"), str(figure_num)])
            caption_txt = "Figure {nr} ".format(nr=nr)
            reference = E.p({"class": "pp_figure_ref"}, u"\u21AA " + caption_txt)
            if cls == "pp_figure":
                caption = node.xpath('.//*[contains(@class, "thumbcaption")]')
                if caption:
                    node.addnext(reference)
                    caption = caption[0]
                    # put the bold label in front of the existing caption text:
                    # the old leading text becomes the tail of the new <b>
                    prefix = E.b(caption_txt)
                    caption.insert(0, prefix)
                    prefix.tail = caption.text
                    caption.text = None
                    utils.append_class(caption, "pp_figure_caption")
                    continue
            # no existing caption to reuse: wrap the node and append a new caption;
            # the matched class moves from the node to the wrapper
            wrapper = utils.wrap_node(node, "div", {"class": cls})
            caption = E.div({"class": "pp_figure_caption"}, E.b(caption_txt))
            wrapper.append(caption)
            utils.remove_class(node, cls)
            wrapper.addnext(reference)
    _combine_references(root)
Пример #21
0
def handle_table_width(node, width):
    """
    classify a table by its width; nodes that are not tables are ignored
    - ``reg-table`` when the width does not exceed ``config.reg_width``
    - a ``width`` attribute is dropped and ``wide-table`` added when the
      table is wider than ``config.reg_width``
    - a ``div`` parent gets ``wide-table`` when the width lies between
      ``config.reg_width`` (exclusive) and ``config.ext_width`` (inclusive)
    """
    if node.tag != "table":
        return

    if width <= config.reg_width:
        utils.append_class(node, "reg-table")

    # tables blown up by width attributes
    if node.get("width") and width > config.reg_width:
        node.attrib.pop("width")
        utils.append_class(node, "wide-table")

    parent = node.getparent()
    if parent.tag == "div" and config.reg_width < width <= config.ext_width:
        utils.append_class(parent, "wide-table")
Пример #22
0
def resize_node_width_to_columns(node, width_in_pt, use_thirds_only=True):
    """
    resizes a given node to columns by adding a col-* class

    - the explicit node width is removed first
    - the first entry of ``config.columns`` wider than ``width_in_pt``
      decides the column count (its 1-based position)
    - with ``use_thirds_only`` the count is rounded up to a multiple of 4
    - if no column width is large enough the node is either wrapped in
      ``div.over-wide-wrapper`` and tagged ``over-wide`` (width within
      ``config.tolerated_over_width``) or tagged ``rotated-table``
    """
    utils.remove_node_width(node)
    # materialise once: dict views have no .index() on Python 3
    column_widths = list(config.columns.values())
    target_col_width = next(
        (width for width in column_widths if width > width_in_pt), 0)
    if target_col_width == 0:
        if width_in_pt <= config.tolerated_over_width:
            utils.wrap_node(node, "div", {"class": "over-wide-wrapper"})
            utils.append_class(node, "over-wide")
        else:
            utils.append_class(node, "rotated-table")
        return
    cols = column_widths.index(target_col_width) + 1
    if use_thirds_only:
        cols = int(4 * ceil(float(cols) / 4))
    utils.append_class(node, "col-{}".format(cols))
Пример #23
0
def markup_short_tables(root):
    """
    Tag tables with 1 to 19 descendant rows as ``short-table``.
    """
    for table in root.xpath("//table"):
        row_count = len(table.xpath("descendant::tr"))
        if row_count == 0 or row_count >= 20:
            continue
        utils.append_class(table, "short-table")
Пример #24
0
def apply_article_options(root, options=""):
    """
    Apply per-article options; only "notext" is handled.

    When "notext" is contained in ``options`` the first ``article`` element
    below ``root`` gets the ``nodisplay`` class.
    """
    if "notext" not in options:
        return
    first_article = root.find(".//article")
    utils.append_class(first_article, "nodisplay")
Пример #25
0
def markup_floated_tables(root):
    """
    Tag tables whose style sets ``float`` to ``right`` as ``right-floated-table``.
    """
    for table in root.xpath("//table"):
        table_styles = utils.get_node_style(table)
        if table_styles.get("float") != "right":
            continue
        utils.append_class(table, "right-floated-table")
Пример #26
0
def h1_add_no_top_margin(root):
    """
    Add ``no-top-margin`` to articles that immediately follow a chapter.

    A chapter is an ``article`` whose class is exactly ``pp_chapter`` and
    which has exactly one child element; the class is put on the first
    ``article`` sibling following it.
    """
    query = '//article[@class="pp_chapter" and count(*) = 1]/following-sibling::article[1]'
    for following_article in root.xpath(query):
        utils.append_class(following_article, "no-top-margin")
Пример #27
0
def add_center_class(root):
    """
    Tag every ``div`` whose style contains "text-align:center" as ``center``.
    """
    centered_divs = root.xpath('//div[contains(@style, "text-align:center")]')
    for div in centered_divs:
        utils.append_class(div, "center")
Пример #28
0
def mark_img_container(root):
    """
    Tag thumb containers that are direct children of ``article/div`` as ``pp_figure``.

    Example page: https://de.wikipedia.org/wiki/Chaoyang_%28Shantou%29
    """
    query = '//article/div/*[self::div[contains(@class,"thumb ")]]'
    for thumb in root.xpath(query):
        utils.append_class(thumb, "pp_figure")