示例#1
0
def clone_beautiful_soup_tag(elements):
	"""
	Return a deep copy of a Beautiful Soup node, or of every node in an iterable.

	A ``NavigableString`` (or subclass) is re-created through its own type.
	A ``Tag`` or ``BeautifulSoup`` object is rebuilt as a fresh ``Tag`` with the
	same builder, name, namespace and prefix, a copy of the attribute dict, the
	``can_be_empty_element`` and ``hidden`` flags, and recursively cloned
	children. Any other value is treated as an iterable of such nodes.

	:param elements: a single node, or an iterable of nodes
	:type elements: Tag or NavigableString or BeautifulSoup or iterable
	:return: the clone, or a list of clones when an iterable was given
	:rtype: Tag or NavigableString or list
	:raises ElementTypeError: if ``elements`` is None, or a plain ``str`` /
		``bytes`` value (which is not a Beautiful Soup node)
	"""
	if elements is None:
		raise ElementTypeError('elements is None!')

	# NavigableString must be tested before the plain-str guard below because
	# it is itself a str subclass.
	if isinstance(elements, NavigableString):
		return type(elements)(elements)

	if isinstance(elements, (Tag, BeautifulSoup)):
		cloned = Tag(None, elements.builder, elements.name, elements.namespace, elements.nsprefix)

		# work around bug where there is no builder set
		# https://bugs.launchpad.net/beautifulsoup/+bug/1307471
		# The attributes and flags are therefore copied over by hand.
		cloned.attrs = dict(elements.attrs)
		for attr in ('can_be_empty_element', 'hidden'):
			setattr(cloned, attr, getattr(elements, attr))
		for child in elements.contents:
			cloned.append(clone_beautiful_soup_tag(child))
		return cloned

	# A plain string iterates into one-character strings, each of which
	# iterates into itself, so the iterable branch would recurse forever.
	if isinstance(elements, (str, bytes)):
		raise ElementTypeError('elements is a plain string, not a NavigableString!')

	return [clone_beautiful_soup_tag(x) for x in elements]
示例#2
0
    def flatten_html(html):
        """
        Flatten an HTML document into a list describing its text nodes.

        Every descendant of ``<body>`` is visited in document order. Each tag
        has its attributes replaced by ``merge_map(own attrs, parent attrs)``,
        so by the time a text node is reached its parent already carries the
        attributes merged down from its ancestors. Each non-blank text node
        whose direct parent is not ``style``, ``script`` or ``code`` yields
        one dict with the keys ``data`` (the text node), ``name`` (parent tag
        name), ``class`` (parent's ``class`` attribute, ``[]`` if absent) and
        ``index`` (0-based position in the output).

        NOTE: the parsed tree is modified in place (tag attributes are
        overwritten with the merged result).

        So if we have:
        <div class="A1 B1">
            <div class="A2 C2">inner1</div>
            <span class="A2 D2">inner2</span>
        </div>

        then we create (the exact class order is whatever ``merge_map``,
        defined elsewhere, produces):

        [
            {
                "class": [
                    "A1",
                    "C2",
                    "A2",
                    "B1"
                ],
                "data": "inner1",
                "index": 0,
                "name": "div"
            },
            {
                "class": [
                    "A1",
                    "A2",
                    "D2",
                    "B1"
                ],
                "data": "inner2",
                "index": 1,
                "name": "span"
            }
        ]

        :param html: markup accepted by ``HTMLUtil.get_soup_from_html``
        :return: list of dicts as described above; empty when the parsed
            document has no ``<body>``
        """
        soup = HTMLUtil.get_soup_from_html(html)
        body = soup.body
        if body is None:
            # Nothing to walk (e.g. empty input); avoid AttributeError on None.
            return []

        skipped_parents = ("style", "script", "code")
        output = []
        for ch in body.descendants:
            if not isinstance(ch, NavigableString):
                # Push the parent's (already merged) attributes down one level.
                ch.attrs = merge_map(ch.attrs, ch.parent.attrs)
                continue

            if not ch.strip() or ch.parent.name in skipped_parents:
                continue

            output.append({
                "data": ch,
                "name": ch.parent.name,
                "class": ch.parent.attrs.get("class", []),
                # Entries are only ever appended, so the running index is
                # simply the current length of the output.
                "index": len(output),
            })
        return output
示例#3
0
 def insert_tag(self, tag_dict):
     """Append a new tag described by ``tag_dict`` to the ``TAGS`` container.

     ``tag_dict['name']`` becomes the tag name and every remaining item
     becomes an attribute of the tag. If no ``TAGS`` element is found, one
     is first appended to ``self.root``. The new tag is followed by a
     newline string inside ``TAGS``.

     The caller's dictionary is left untouched: a shallow copy is used, so
     ``'name'`` is not removed from it and later changes to the tag's
     attributes do not leak back into it.

     :param tag_dict: mapping with a ``'name'`` key plus attribute items
     :raises KeyError: if ``tag_dict`` has no ``'name'`` key
     """
     attrs = dict(tag_dict)
     tag = Tag(name=attrs.pop('name'))
     tag.attrs = attrs
     # NOTE(review): ``self.root`` / ``self.TAGS`` presumably resolve to the
     # first child tag of that name on a soup-like object; confirm against
     # the enclosing class.
     if not self.findAll('TAGS'):
         self.root.append(Tag(name='TAGS'))
     self.TAGS.append(tag)
     self.TAGS.append('\n')
示例#4
0
def clone(el):
    """Return a deep copy of a Beautiful Soup node.

    Strings are re-created through their own type. Any other node is rebuilt
    as a new ``Tag`` with the same builder, name, namespace and prefix, a
    copied attribute dict, the same ``can_be_empty_element`` / ``hidden``
    flags, and recursively cloned children.
    """
    if isinstance(el, NavigableString):
        string_type = type(el)
        return string_type(el)

    duplicate = Tag(None, el.builder, el.name, el.namespace, el.nsprefix)

    # Attributes and flags are transferred by hand as a workaround for the
    # bug where there is no builder set:
    # https://bugs.launchpad.net/beautifulsoup/+bug/1307471
    duplicate.attrs = dict(el.attrs)
    duplicate.can_be_empty_element = el.can_be_empty_element
    duplicate.hidden = el.hidden

    for node in el.contents:
        duplicate.append(clone(node))
    return duplicate
示例#5
0
    def test_default_attributes(self):
        """Check the default behavior of Formatter.attributes() and indent."""
        default_formatter = Formatter()
        element = Tag(name="tag")

        # Insert the attributes out of alphabetical order on purpose: in
        # Python 3 a dictionary would normally hand them back in insertion
        # order, so a sorted result proves the formatter sorts by name.
        element['b'] = 1
        element['a'] = 2
        expected_sorted = [('a', 2), ('b', 1)]
        assert expected_sorted == default_formatter.attributes(element)

        # Tag.attrs being None shouldn't normally happen, but the formatter
        # is expected to cope with it and report no attributes.
        element.attrs = None
        assert [] == default_formatter.attributes(element)

        # The default indent is a single space.
        assert ' ' == default_formatter.indent
示例#6
0
def _prepare(doc):
    """Attach the custom ODT styles and sequence declarations to ``doc``.

    Side effects on ``doc``:

    * ``doc.auto_styles`` is created as an empty list if missing, then one
      string is appended to it: the prettified XML of three custom styles
      (``Keep_20_Caption_With_Next``, ``Vertical`` and ``Table_20_Legend``)
      joined by newlines.
    * ``doc.sequence_decls`` is set to a one-element list holding the
      prettified ``text:sequence-decls`` element, which declares the
      ``Illustration``, ``Table``, ``Text`` and ``Drawing`` sequences using
      the document's ``outline-level`` metadata (default ``"1"``).

    :param doc: document object providing ``get_metadata``
    """
    doc.auto_styles = getattr(doc, "auto_styles", [])

    custom_style_specs = [
        {
            "name": "style:style",
            "attrs": {
                "style:name": "Keep_20_Caption_With_Next",
                "style:family": "paragraph",
                "style:parent-style-name": "Caption",
            },
            "contents": [{
                "name": "style:paragraph-properties",
                "attrs": {
                    "fo:keep-together": "always"
                },
            }],
        },
        {
            "name": "style:style",
            "attrs": {
                "style:name": "Vertical",
                "style:family": "text"
            },
            "contents": [{
                "name": "style:text-properties",
                "attrs": {
                    "style:text-rotation-angle": "90",
                    "style:text-rotation-scale": "line-height",
                },
            }],
        },
        {
            "name": "style:style",
            "attrs": {
                "style:name": "Table_20_Legend",
                "style:family": "paragraph",
                "style:parent-style-name": "Standard",
            },
            "contents": [{
                "name": "style:paragraph-properties",
                "attrs": {
                    # All four margins use the "fo:" namespace prefix; the
                    # hyphenated "fo-margin-*" spelling is not a formatting
                    # property and would be ignored by consumers.
                    "fo:margin-left": "0.1799in",
                    "fo:margin-right": "0in",
                    "fo:margin-top": "0.1598in",
                    "fo:margin-bottom": "0.2in",
                    "loext:contextual-spacing": "true",
                    "fo:text-indent": "0in",
                },
            }],
        },
    ]

    custom_styles_root = BeautifulSoup("", "xml")
    for spec in custom_style_specs:
        custom_styles_root.append(utils.create_nested_tags(**spec))

    doc.auto_styles.append(
        "\n".join(cs.prettify() for cs in custom_styles_root.contents))

    sequence_decls_root = BeautifulSoup("", "xml")
    sequence_decls = Tag(name="text:sequence-decls")
    sequence_decls_root.contents.append(sequence_decls)

    # The outline level is the same for every declaration; look it up once.
    outline_level = str(doc.get_metadata("outline-level", "1"))

    for n in ["Illustration", "Table", "Text", "Drawing"]:
        t = Tag(name="text:sequence-decl")
        t.attrs = {
            "text:display-outline-level": outline_level,
            "text:name": n,
            # ODF spells this attribute "separation", not "seperation".
            "text:separation-character": ".",
        }
        sequence_decls.contents.append(t)

    doc.sequence_decls = [
        "\n".join(sd.prettify() for sd in sequence_decls_root.contents)
    ]
示例#7
0
def render_table_odt(elem, doc):
    """Render a table element as OpenDocument XML and register its styles.

    The output consists of a caption paragraph (``Table <number>: <caption>``),
    the ``table:table`` element with its column definitions and rows, and
    one ``Table_20_Legend`` paragraph per footer definition item, if a
    footer is present at ``elem.content[1].content[0]``.

    Column and cell styles generated along the way are appended, as a single
    prettified string, to ``doc.auto_styles``.
    NOTE(review): ``doc.auto_styles`` must already exist; presumably it is
    initialised by a prepare step before rendering. Confirm against callers.

    Column widths: entries of ``table.width`` that are falsy (``0`` or
    ``None``) are treated as unspecified and share the width not claimed by
    the specified columns equally. All widths are then scaled by
    ``table.total_width``.

    :param elem: element whose ``content[0]`` is the table and whose
        optional ``content[1].content[0]`` is the footer definition list
    :param doc: document object; used for table numbering and
        ``auto_styles``
    :return: the rendered XML as a string (top-level nodes joined by
        newlines)
    """
    table = elem.content[0]
    table_number = tuple(
        str(i) for i in utils.get_elem_count(doc, pf.Table, register="table"))
    table_name = "Table{}".format("_".join(table_number))

    table_root = BeautifulSoup("", "xml")

    # ---- caption -------------------------------------------------------
    if hasattr(table, "caption") and table.caption:
        colon = ": "
        caption = "".join(pf.stringify(c) for c in table.caption)
    else:
        colon = ""
        caption = ""

    caption_odt = utils.create_nested_tags(
        **{
            "name":
            "text:p",
            "attrs": {
                "text:style-name": "Table"
            },
            "contents": [
                {
                    "name":
                    "text:span",
                    "attrs": {
                        "text:style-name": "Strong_20_Emphasis"
                    },
                    "contents": [
                        "Table ",
                        {
                            "name": "text:sequence",
                            "attrs": {
                                "text:ref-name": f"ref{table_name}",
                                "text:name": "Table",
                                "text:formula": "ooow:Table+1",
                                "style:num-format": "1",
                            },
                            "contents": [".".join(table_number)],
                        },
                        colon,
                    ],
                },
                caption,
            ],
        })

    table_root.contents.append(caption_odt)

    # ---- table element and column definitions --------------------------
    table_odt = utils.create_nested_tags(
        **{
            "name": "table:table",
            "attrs": {
                "table:name": table_name,
                "table:style-name": table_name,
                "table:template-name": "Default Style",
            },
        })

    table_root.contents.append(table_odt)

    # Only truthy widths count as specified; skipping falsy entries keeps
    # the sum working when unspecified widths are given as None.
    unoccupied_width = 1 - sum(w for w in table.width if w)
    unspecified_widths = sum(1 for w in table.width if not w)
    # When every column has an explicit width there is nothing to share
    # out; guard against dividing by zero.
    if unspecified_widths:
        remaining_for_each = unoccupied_width / unspecified_widths
    else:
        remaining_for_each = 0

    widths = [w if w else remaining_for_each for w in table.width]

    # We want the table to occupy a maximum width
    widths = [w * table.total_width for w in widths]

    column_style_names, column_styles, column_definitions = zip(
        *create_column_definitions(widths, table_name))

    pf.debug(column_style_names, column_styles, column_definitions)

    styles = BeautifulSoup("", "xml")
    styles.contents = list(column_styles)

    table_odt.contents.extend(column_definitions)

    # Compiled once; used to locate paragraph tags in every cell's content.
    text_p = re.compile("text:p")

    # ---- rows and cells ------------------------------------------------
    for r, row in enumerate(table.content):
        row_odt = Tag(name="table:table-row")
        row_odt.attrs = {"table:style-name": f"{table_name}.{r + 1}"}

        # One entry per cell position; None marks a covered cell so that
        # underline ranges can still index by column position.
        row_cell_styles = []

        for c, cell in enumerate(row.content):

            if cell.covered:
                cell_odt = Tag(name="table:covered-table-cell")
                row_odt.contents.append(cell_odt)

                row_cell_styles.append(None)
                continue

            cell_odt = Tag(name="table:table-cell")

            cell_style_name = f"{column_style_names[c]}{r + 1}"

            cell_style = Tag(name="style:style")
            cell_style.attrs = {
                "style:name": cell_style_name,
                "style:family": "table-cell",
                "style:writing-mode": "page",
            }
            style_cell_properies = Tag(name="style:table-cell-properties")
            style_cell_properies.attrs = {
                "fo:padding-left": "0.10cm",
                "fo:padding-right": "0.10cm",
                "fo:padding-top": "0.10cm",
                "fo:padding-bottom": "0.10cm",
                "style:vertical-align": "bottom",
            }
            style_background_image = Tag(name="style:background-image")
            style_cell_properies.contents.append(style_background_image)
            cell_style.contents.append(style_cell_properies)

            row_cell_styles.append(cell_style)

            cell_odt.attrs = {
                "table:style-name": cell_style_name,
                "office:value-type": "string",
            }

            if cell.col_span > 1:
                cell_odt.attrs[
                    "table:number-columns-spanned"] = cell.col_span

            if cell.content:
                cell_content = utils.panflute2output(
                    cell.content, format="opendocument").strip()

                cell_content = BeautifulSoup(cell_content,
                                             "lxml").html.body

                for t in cell_content.find_all(text_p):
                    if cell.heading == 1:
                        t["text:style-name"] = "Table_20_Heading"
                    elif cell.heading == 2:
                        t["text:style-name"] = "Table_20_Subheading"
                    else:
                        t["text:style-name"] = "Table_20_Contents"

                    if cell.vertical:
                        # Wrap the paragraph's children in a span that
                        # carries the "Vertical" text style.
                        t_contents = t.contents
                        t.contents = [
                            utils.create_nested_tags(
                                **{
                                    "name": "text:span",
                                    "attrs": {
                                        "text:style-name": "Vertical"
                                    },
                                    "contents": t_contents,
                                })
                        ]
                cell_odt.contents = cell_content.contents
            else:
                # Empty cells still need a paragraph; use the same style
                # name (capital "C") as non-empty content cells.
                cell_content = Tag(name="text:p")
                cell_content.attrs = {
                    "text:style-name": "Table_20_Contents"
                }
                cell_odt.contents.append(cell_content)

            row_odt.contents.append(cell_odt)

        if row.underlines:
            for underline in row.underlines:
                start = underline[0]
                stop = underline[1]

                # The range start - 1 .. stop - 1 shows the underline
                # bounds are 1-based and inclusive.
                for i in range(start - 1, stop):
                    cell_style = row_cell_styles[i]

                    if cell_style is not None:
                        cell_style.contents[0].attrs[
                            "fo:border-bottom"] = "0.5pt solid #000000"

        # Bottom space requested on the previous row is realised as extra
        # top padding on this row.
        add_top_space = table.content[r - 1].btm_space if r else False

        if row.top_space or add_top_space:
            for cell_style in row_cell_styles:
                if cell_style is not None:
                    padding_top = cell_style.contents[0].attrs[
                        "fo:padding-top"]

                    padding_top = (float(padding_top.strip("cm")) +
                                   0.05 * add_top_space + 0.05 * row.top_space)

                    # Round away binary float noise so the style reads
                    # "0.15cm" rather than "0.15000000000000002cm".
                    cell_style.contents[0].attrs[
                        "fo:padding-top"] = f"{round(padding_top, 4)}cm"

        styles.contents.extend(
            cs for cs in row_cell_styles if cs is not None)

        table_odt.contents.append(row_odt)

    # ---- optional footer / legend --------------------------------------
    try:
        footer = elem.content[1].content[0]
    except IndexError:
        footer = None

    if footer is not None:
        for definition_item in footer.content:
            term = "".join(pf.stringify(e) for e in definition_item.term)

            definitions = [
                utils.panflute2output(d.content, format="opendocument")
                for d in definition_item.definitions
            ]
            definitions_parsed = BeautifulSoup("".join(definitions),
                                               "lxml").html.body.contents

            # Turn each definition paragraph into an inline span, separated
            # from the preceding text by a single space.
            for t in definitions_parsed:
                if t.name == "text:p":
                    t.name = "text:span"
                    t.contents.insert(0, NavigableString(" "))

            definition = utils.create_nested_tags(
                **{
                    "name":
                    "text:p",
                    "attrs": {
                        "text:style-name": "Table_20_Legend"
                    },
                    "contents": [{
                        "name": "text:span",
                        "attrs": {
                            "text:style-name": "Superscript"
                        },
                        "contents": [term],
                    }] + definitions_parsed,
                })
            table_root.contents.append(definition)

    # ---- serialise -----------------------------------------------------
    styles_xml = "\n".join(c.prettify() for c in styles.contents)
    doc.auto_styles.append(styles_xml)

    table_xml = "\n".join(str(c) for c in table_root.contents)
    # pf.debug(table_xml)

    return table_xml