def test_line_break_tags():
    """Check that line-breaking tags split text into separate paragraphs.

    The markup places a single letter between each pair of tags; the
    processed document is expected to contain one text paragraph per letter,
    in order, with surrounding whitespace removed (e.g. ``"l "`` -> ``"l"``).
    """
    html = """a<p>b<br>c<h1>d<h2>e<h3>f<h4>g<h5>h<h6>i<pre>j<address>k<blockquote>l <dl>m<div>n<fieldset>o<form>p<hr>q<ol>r<ul>s<li>t """
    actual = process_html(html)

    # The letters a..t appear in the markup in exactly this order, one per
    # expected paragraph.
    paragraphs = [
        TextParagraph().append_text_segment(TextSegment(letter))
        for letter in "abcdefghijklmnopqrst"
    ]
    expected = Doc().append_blocks(paragraphs)

    assert actual.to_dict() == expected.to_dict()
def _process_text(text: str) -> Doc:
    """Build a Dolphin Doc from plain text.

    The text is split into lines; each line is stripped of leading and
    trailing whitespace, and every line that is non-empty after stripping
    becomes its own single-segment text paragraph. Blank lines are skipped.

    Args:
        text: The plain-text content to convert.

    Returns:
        A new ``Doc`` holding one ``TextParagraph`` per non-blank line.
    """
    result = Doc()
    stripped_lines = (raw.strip() for raw in text.splitlines())
    # filter(None, ...) drops the lines that are empty after stripping.
    for content in filter(None, stripped_lines):
        paragraph = TextParagraph().append_text_segment(TextSegment(content))
        result.append_block(paragraph)
    return result
def test_ignore_tags():
    """Check that ``<style>``, ``<script>`` and ``<noscript>`` content is dropped.

    Only the text outside those tags ("a" and "e") is expected to survive,
    joined into a single text segment of one paragraph.
    """
    html = """a<style>b</style><script>c</script><noscript>d</noscript>e"""
    actual = process_html(html)

    surviving_text = TextSegment("ae")
    only_paragraph = TextParagraph().append_text_segment(surviving_text)
    expected = Doc().append_blocks([only_paragraph])

    assert actual.to_dict() == expected.to_dict()
def test_plain_text():
    """Check that plain text becomes one paragraph per non-blank line.

    The input mixes empty and whitespace-only lines between the three real
    lines; only the three real lines are expected in the resulting document.
    """
    text = "paragraph 1\nparagraph 2\n\n \n \nparagraph 3\n"
    actual = process(Content(data=text))

    expected_texts = ("paragraph 1", "paragraph 2", "paragraph 3")
    paragraphs = [
        TextParagraph().append_text_segment(TextSegment(content))
        for content in expected_texts
    ]
    expected = Doc().append_blocks(paragraphs)

    assert actual.to_dict() == expected.to_dict()
def test_plain_text_from_file():
    """Check that plain text loaded from a file yields the expected paragraphs.

    The content is read through ``process`` with a file-sourced ``Content``;
    the fixture file is expected to produce four paragraphs.
    """
    file_content = Content(
        source=ContentSource.FILE,
        path="dolphin_doc_lib/testdata/plain_text.txt",
    )
    actual = process(file_content)

    expected_texts = ("paragraph 1", "paragraph 2", "paragraph 3", "paragraph 4")
    paragraphs = [
        TextParagraph().append_text_segment(TextSegment(content))
        for content in expected_texts
    ]
    expected = Doc().append_blocks(paragraphs)

    assert actual.to_dict() == expected.to_dict()
def test_standard_table():
    """Check that a table with thead/tbody/tfoot becomes a single Table block.

    The four ``<tr>`` rows (one header, two body, one footer) each hold two
    cells; the expected table lists them row by row, each as a 1x1 cell
    containing one text paragraph.
    """
    html = """ <table> <thead> <tr> <th>Month</th> <th>Savings</th> </tr> </thead> <tbody> <tr> <td>January</td> <td>$100</td> </tr> <tr> <td>February</td> <td>$80</td> </tr> </tbody> <tfoot> <tr> <td>Sum</td> <td>$180</td> </tr> </tfoot> </table>"""
    actual = process_html(html)

    def text_cell(col, row, content):
        # First Rect argument is the position within the row, second is the
        # row index; every cell spans 1x1.
        paragraph = TextParagraph().append_text_segment(TextSegment(content))
        return Cell(Rect[int](col, row, 1, 1)).append_paragraph(paragraph)

    row_texts = [
        ("Month", "Savings"),
        ("January", "$100"),
        ("February", "$80"),
        ("Sum", "$180"),
    ]
    # Row-major order: all cells of row 0, then row 1, and so on.
    cells = [
        text_cell(col, row, content)
        for row, contents in enumerate(row_texts)
        for col, content in enumerate(contents)
    ]
    expected = Doc().append_block(Table(4, 2, cells))

    assert actual.to_dict() == expected.to_dict()
def process_html(html: str) -> Doc:
    """Build a Dolphin Doc from an HTML string.

    The markup is parsed by BeautifulSoup with the ``html5lib`` parser, the
    ``<body>`` element is handed to ``_process``, and the blocks of the
    returned ``BlocksInfo`` are appended to a fresh ``Doc``.

    Args:
        html: The HTML markup to convert.

    Returns:
        A new ``Doc`` containing the blocks extracted from the body.
    """
    soup = BeautifulSoup(html, 'html5lib')
    body = soup.body
    # _process's result is treated as BlocksInfo here; cast only informs the
    # type checker and performs no runtime conversion.
    info = cast(BlocksInfo, _process(body))
    return Doc().append_blocks(info.blocks)