def clone_beautiful_soup_tag(elements):
    """
    Deep-copy a single BeautifulSoup node, or each node of an iterable.

    :type elements: Tag or NavigableString or BeautifulSoup, or an iterable of those
    :rtype: Tag or NavigableString for a single node (a BeautifulSoup input
        is copied into a plain Tag); a list of clones for any other input
    :raises ElementTypeError: if ``elements`` is None
    """
    if elements is None:
        raise ElementTypeError('elements is None!')

    # Anything that is not a single node is treated as an iterable of nodes
    # and cloned item by item.
    if not isinstance(elements, (Tag, NavigableString, BeautifulSoup)):
        return [clone_beautiful_soup_tag(item) for item in elements]

    node = elements
    if isinstance(node, NavigableString):
        # Strings are rebuilt through their own class so the subclass is kept.
        return type(node)(node)

    duplicate = Tag(None, node.builder, node.name, node.namespace, node.nsprefix)
    # work around bug where there is no builder set
    # https://bugs.launchpad.net/beautifulsoup/+bug/1307471
    duplicate.attrs = dict(node.attrs)
    duplicate.can_be_empty_element = node.can_be_empty_element
    duplicate.hidden = node.hidden

    # Recurse so that the copy owns its own children.
    for child in node.contents:
        duplicate.append(clone_beautiful_soup_tag(child))
    return duplicate
def flatten_html(html):
    """
    Flatten an HTML document into a list of its non-blank text nodes.

    Every tag below ``<body>`` first has its attributes replaced by
    ``merge_map(own_attrs, parent_attrs)`` (``merge_map`` is defined
    elsewhere; presumably it combines the parent's attributes into the
    child's -- confirm there). Because nodes are visited in document order,
    a text node's parent has already been merged with its own ancestors by
    the time the text node is reached.

    So if we have:

        <div class="A1 B1">
            <div class="A2 C2">inner1</div>
            <span class="A2 D2">inner2</span>
        </div>

    then we create (class order is illustrative):

        [
            {
                "class": ["A1", "C2", "A2", "B1"],
                "data": "inner1",
                "name": "div",
                "index": 0
            },
            {
                "class": ["A1", "A2", "D2", "B1"],
                "data": "inner2",
                "name": "span",
                "index": 1
            }
        ]

    Text that is whitespace-only, or whose direct parent is a ``style``,
    ``script`` or ``code`` tag, is skipped and does not consume an index.

    Note: the parsed tree is modified in place (tag attributes are
    overwritten); only the returned list is meant for consumption.

    :param html: HTML source handed to ``HTMLUtil.get_soup_from_html``.
    :return: list of dicts with keys ``data``, ``name``, ``class``, ``index``.
    """
    skipped_parents = ("style", "script", "code")

    soup = HTMLUtil.get_soup_from_html(html)
    output = []
    index = 0
    # `.descendants` is the bs4 name for the legacy recursiveChildGenerator().
    for ch in soup.body.descendants:
        if not isinstance(ch, NavigableString):
            # A tag: push the parent's attributes down before its own
            # children are visited.
            ch.attrs = merge_map(ch.attrs, ch.parent.attrs)
            continue

        if not ch.strip():
            continue
        if ch.parent.name in skipped_parents:
            continue

        output.append({
            "data": ch,
            "name": ch.parent.name,
            "class": ch.parent.attrs.get("class", []),
            "index": index
        })
        index += 1
    return output
def insert_tag(self, tag_dict):
    """Append a new tag described by ``tag_dict`` to the ``TAGS`` container.

    ``tag_dict`` must contain a ``'name'`` entry, which becomes the tag
    name; all remaining entries become the tag's attributes. If the
    document has no ``TAGS`` element yet, one is created under
    ``self.root`` first. A newline string is appended after the new tag.

    The caller's dictionary is left untouched: the attributes are taken
    from a shallow copy, so the new tag does not share state with it.

    :param tag_dict: mapping with a ``'name'`` key plus attribute entries.
    :raises KeyError: if ``tag_dict`` has no ``'name'`` entry.
    """
    # Work on a copy so popping 'name' neither mutates the argument nor
    # leaves the tag's attrs aliased to a dict the caller still owns.
    attrs = dict(tag_dict)
    tag = Tag(name=attrs.pop('name'))
    tag.attrs = attrs

    if not self.findAll('TAGS'):
        self.root.append(Tag(name='TAGS'))
    self.TAGS.append(tag)
    self.TAGS.append('\n')
def clone(el):
    """Return a deep copy of a BeautifulSoup node.

    A NavigableString is rebuilt through its own class. Any other node is
    copied into a fresh ``Tag`` with the same builder, name, namespace and
    prefix, a copy of the attribute dict, and recursively cloned children.
    """
    if isinstance(el, NavigableString):
        return type(el)(el)

    twin = Tag(None, el.builder, el.name, el.namespace, el.nsprefix)
    # work around bug where there is no builder set
    # https://bugs.launchpad.net/beautifulsoup/+bug/1307471
    twin.attrs = dict(el.attrs)
    twin.can_be_empty_element = el.can_be_empty_element
    twin.hidden = el.hidden

    for kid in el.contents:
        twin.append(clone(kid))
    return twin
def test_default_attributes(self):
    """Check the default behavior of Formatter.attributes() and indent."""
    formatter = Formatter()
    tag = Tag(name="tag")
    # Insert the attributes in reverse alphabetical order on purpose.
    for key, value in (('b', 1), ('a', 2)):
        tag[key] = value

    # Attributes come out sorted by name. In Python 3, attributes
    # normally come out of a dictionary in the order they were
    # added.
    sorted_pairs = formatter.attributes(tag)
    assert sorted_pairs == [('a', 2), ('b', 1)]

    # This works even if Tag.attrs is None, though this shouldn't
    # normally happen.
    tag.attrs = None
    no_pairs = formatter.attributes(tag)
    assert no_pairs == []

    # The default indent is a single space.
    assert formatter.indent == ' '
def _prepare(doc):
    """Attach the custom ODF styles and sequence declarations to ``doc``.

    Side effects on ``doc``:

    * ``doc.auto_styles`` is created if missing and extended with one
      string holding the prettified XML of three ``style:style`` elements:
      ``Keep_20_Caption_With_Next``, ``Vertical`` and ``Table_20_Legend``.
    * ``doc.sequence_decls`` is set to a one-element list holding the
      prettified ``text:sequence-decls`` element, with one
      ``text:sequence-decl`` per sequence name (Illustration, Table, Text,
      Drawing). Their outline level is read from the ``outline-level``
      metadata entry (default ``"1"``).

    :param doc: document object; must provide ``get_metadata``.
    """
    doc.auto_styles = getattr(doc, "auto_styles", [])

    # Declarative description of the custom styles, in output order.
    custom_style_specs = [
        {
            "name": "style:style",
            "attrs": {
                "style:name": "Keep_20_Caption_With_Next",
                "style:family": "paragraph",
                "style:parent-style-name": "Caption",
            },
            "contents": [{
                "name": "style:paragraph-properties",
                "attrs": {
                    "fo:keep-together": "always"
                },
            }],
        },
        {
            "name": "style:style",
            "attrs": {
                "style:name": "Vertical",
                "style:family": "text"
            },
            "contents": [{
                "name": "style:text-properties",
                "attrs": {
                    "style:text-rotation-angle": "90",
                    "style:text-rotation-scale": "line-height",
                },
            }],
        },
        {
            "name": "style:style",
            "attrs": {
                "style:name": "Table_20_Legend",
                "style:family": "paragraph",
                "style:parent-style-name": "Standard",
            },
            "contents": [{
                "name": "style:paragraph-properties",
                "attrs": {
                    # The margin attributes belong to the "fo:" namespace;
                    # they were previously written as "fo-margin-*", which
                    # is not a namespaced attribute name.
                    "fo:margin-left": "0.1799in",
                    "fo:margin-right": "0in",
                    "fo:margin-top": "0.1598in",
                    "fo:margin-bottom": "0.2in",
                    "loext:contextual-spacing": "true",
                    "fo:text-indent": "0in",
                },
            }],
        },
    ]

    custom_styles_root = BeautifulSoup("", "xml")
    for spec in custom_style_specs:
        custom_styles_root.append(utils.create_nested_tags(**spec))
    doc.auto_styles.extend(
        ["\n".join(cs.prettify() for cs in custom_styles_root.contents)])

    sequence_decls_root = BeautifulSoup("", "xml")
    sequence_decls = Tag(name="text:sequence-decls")
    # NOTE(review): nodes are added straight to `.contents`, bypassing
    # Tag.append()'s tree linking -- confirm this serializes as expected on
    # the bs4 version in use.
    sequence_decls_root.contents.append(sequence_decls)

    # Same for every declaration, so look it up once.
    outline_level = str(doc.get_metadata("outline-level", "1"))
    for sequence_name in ["Illustration", "Table", "Text", "Drawing"]:
        decl = Tag(name="text:sequence-decl")
        decl.attrs = {
            "text:display-outline-level": outline_level,
            "text:name": sequence_name,
            # Spelling fixed: was "text:seperation-character".
            "text:separation-character": ".",
        }
        sequence_decls.contents.append(decl)

    doc.sequence_decls = [
        "\n".join(sd.prettify() for sd in sequence_decls_root.contents)
    ]
def render_table_odt(elem, doc):
    """Render a table element as OpenDocument XML and register its styles.

    ``elem.content[0]`` is taken as the table. If ``elem.content[1]``
    exists, its first child is treated as a footer whose items provide
    ``term`` and ``definitions`` (presumably a definition list -- confirm
    against the caller).

    The output consists of, in order: a caption paragraph (``text:p``)
    carrying a ``text:sequence`` number, the ``table:table`` element with
    its column definitions and rows, and one ``Table_20_Legend`` paragraph
    per footer item.

    Side effect: the prettified column and cell styles are appended to
    ``doc.auto_styles`` as a single string.

    :param elem: element wrapping the table (and optional footer).
    :param doc: document object; must already have ``auto_styles``.
    :return: the rendered XML nodes joined with newlines, as a string.
    """
    table = elem.content[0]
    table_number = tuple(
        str(i) for i in utils.get_elem_count(doc, pf.Table, register="table"))
    table_name = "Table{}".format("_".join(table_number))

    # NOTE(review): throughout this function nodes are added straight to
    # `.contents`, bypassing Tag.append()'s tree linking -- confirm this
    # serializes as expected on the bs4 version in use.
    table_root = BeautifulSoup("", "xml")

    if hasattr(table, "caption") and table.caption:
        colon = ": "
        caption = "".join(pf.stringify(c) for c in table.caption)
    else:
        colon = ""
        caption = ""

    caption_odt = utils.create_nested_tags(
        **{
            "name": "text:p",
            "attrs": {
                "text:style-name": "Table"
            },
            "contents": [
                {
                    "name": "text:span",
                    "attrs": {
                        "text:style-name": "Strong_20_Emphasis"
                    },
                    "contents": [
                        "Table ",
                        {
                            "name": "text:sequence",
                            "attrs": {
                                "text:ref-name": f"ref{table_name}",
                                "text:name": "Table",
                                "text:formula": "ooow:Table+1",
                                "style:num-format": "1",
                            },
                            "contents": [".".join(table_number)],
                        },
                        colon,
                    ],
                },
                caption,
            ],
        })
    table_root.contents.append(caption_odt)

    table_odt = utils.create_nested_tags(
        **{
            "name": "table:table",
            "attrs": {
                "table:name": table_name,
                "table:style-name": table_name,
                "table:template-name": "Default Style",
            },
        })
    table_root.contents.append(table_odt)

    # Columns without an explicit (truthy) width share the leftover width
    # equally. When every column is specified there is nothing to share,
    # and dividing would raise ZeroDivisionError.
    unoccupied_width = 1 - sum(table.width)
    unspecified_widths = sum(1 for w in table.width if not w)
    if unspecified_widths:
        remaining_for_each = unoccupied_width / unspecified_widths
    else:
        remaining_for_each = 0
    widths = [w if w else remaining_for_each for w in table.width]

    # We want the table to occupy a maximum width
    widths = map(lambda x: x * table.total_width, widths)

    column_style_names, column_styles, column_definitions = zip(
        *create_column_definitions(widths, table_name))
    pf.debug(column_style_names, column_styles, column_definitions)

    styles = BeautifulSoup("", "xml")
    styles.contents = list(column_styles)
    table_odt.contents.extend(column_definitions)

    # Compiled once; used to find the paragraphs of every non-empty cell.
    text_p = re.compile("text:p")

    for r, row in enumerate(table.content):
        row_odt = Tag(name="table:table-row")
        row_odt.attrs = {"table:style-name": f"{table_name}.{r + 1}"}

        # One entry per cell, positionally aligned with row.content; None
        # marks a covered cell, which has no style of its own.
        row_cell_styles = []

        for c, cell in enumerate(row.content):
            if cell.covered:
                cell_odt = Tag(name="table:covered-table-cell")
                row_odt.contents.append(cell_odt)
                row_cell_styles.append(None)
            else:
                cell_odt = Tag(name="table:table-cell")
                cell_style_name = f"{column_style_names[c]}{r + 1}"

                cell_style = Tag(name="style:style")
                cell_style.attrs = {
                    "style:name": cell_style_name,
                    "style:family": "table-cell",
                    "style:writing-mode": "page",
                }
                style_cell_properties = Tag(
                    name="style:table-cell-properties")
                style_cell_properties.attrs = {
                    "fo:padding-left": "0.10cm",
                    "fo:padding-right": "0.10cm",
                    "fo:padding-top": "0.10cm",
                    "fo:padding-bottom": "0.10cm",
                    "style:vertical-align": "bottom",
                }
                style_background_image = Tag(name="style:background-image")
                style_cell_properties.contents.append(style_background_image)
                cell_style.contents.append(style_cell_properties)
                row_cell_styles.append(cell_style)

                cell_odt.attrs = {
                    "table:style-name": cell_style_name,
                    "office:value-type": "string",
                }
                if cell.col_span > 1:
                    cell_odt.attrs[
                        "table:number-columns-spanned"] = cell.col_span

                if cell.content:
                    cell_content = utils.panflute2output(
                        cell.content, format="opendocument").strip()
                    cell_content = BeautifulSoup(cell_content,
                                                 "lxml").html.body

                    for t in cell_content.find_all(text_p):
                        if cell.heading == 1:
                            t["text:style-name"] = "Table_20_Heading"
                        elif cell.heading == 2:
                            t["text:style-name"] = "Table_20_Subheading"
                        else:
                            t["text:style-name"] = "Table_20_Contents"

                        if cell.vertical:
                            # Wrap the paragraph's children in a span that
                            # uses the "Vertical" text style.
                            t_contents = t.contents
                            t.contents = [
                                utils.create_nested_tags(
                                    **{
                                        "name": "text:span",
                                        "attrs": {
                                            "text:style-name": "Vertical"
                                        },
                                        "contents": t_contents,
                                    })
                            ]
                    cell_odt.contents = cell_content.contents
                else:
                    # Empty cell: emit an empty paragraph using the same
                    # style name as non-heading cells above (previously
                    # spelled "Table_20_contents").
                    cell_content = Tag(name="text:p")
                    cell_content.attrs = {
                        "text:style-name": "Table_20_Contents"
                    }
                    cell_odt.contents.append(cell_content)

                row_odt.contents.append(cell_odt)

        if row.underlines:
            for underline in row.underlines:
                start = underline[0]
                stop = underline[1]
                # The range covers cell indices start-1 .. stop-1.
                for i in range(start - 1, stop):
                    cell_style = row_cell_styles[i]
                    if cell_style is not None:
                        cell_style.contents[0].attrs[
                            "fo:border-bottom"] = "0.5pt solid #000000"

        # Space requested below the previous row is added on top of this
        # row's cells.
        add_top_space = table.content[r - 1].btm_space if r else False
        if row.top_space or add_top_space:
            for cell_style in row_cell_styles:
                if cell_style is not None:
                    padding_top = cell_style.contents[0].attrs[
                        "fo:padding-top"]
                    padding_top = (float(padding_top.strip("cm")) +
                                   0.05 * add_top_space +
                                   0.05 * row.top_space)
                    cell_style.contents[0].attrs[
                        "fo:padding-top"] = f"{padding_top}cm"

        row_cell_styles = [cs for cs in row_cell_styles if cs is not None]
        styles.contents.extend(row_cell_styles)
        table_odt.contents.append(row_odt)

    try:
        footer = elem.content[1].content[0]
    except IndexError:
        footer = None

    if footer is not None:
        for definition_item in footer.content:
            term = "".join(pf.stringify(e) for e in definition_item.term)
            definitions = [
                utils.panflute2output(d.content, format="opendocument")
                for d in definition_item.definitions
            ]
            definitions_parsed = BeautifulSoup("".join(definitions),
                                               "lxml").html.body.contents

            # Turn each definition paragraph into an inline span, preceded
            # by a space, so it can follow the term inside one paragraph.
            for t in definitions_parsed:
                if t.name == "text:p":
                    t.name = "text:span"
                    t.contents.insert(0, NavigableString(" "))

            definition = utils.create_nested_tags(
                **{
                    "name": "text:p",
                    "attrs": {
                        "text:style-name": "Table_20_Legend"
                    },
                    "contents": [{
                        "name": "text:span",
                        "attrs": {
                            "text:style-name": "Superscript"
                        },
                        "contents": [term],
                    }] + definitions_parsed,
                })
            table_root.contents.append(definition)

    styles = "\n".join(c.prettify() for c in styles.contents)
    doc.auto_styles.append(styles)

    table = "\n".join(str(c) for c in table_root.contents)
    return table