def test_extract_from_head_tag_empty_head(self, processing_options):
    """An empty <head> element produces no extracted item (None)."""
    markup = "<head></head>"
    head_element = helper_functions.make_soup_from_html(markup).find('head')
    # Sanity check: the parser really handed back the <head> element.
    assert head_element.name == 'head'

    extracted = html_data_extractors.extract_from_head_tag(
        head_element, processing_options)

    assert extracted is None
def test_extract_from_div_incorrect_tag(self, processing_options):
    """Passing a non-div tag (<title>) to extract_from_div yields None."""
    html = "<title>My Title</title>"
    soup = helper_functions.make_soup_from_html(html)
    tag = soup.find('title')
    # Sanity check: we are deliberately handing over the wrong tag type.
    assert tag.name == 'title'

    result = html_data_extractors.extract_from_div(tag, processing_options)

    # Identity check rather than ``== None``: an object with a permissive
    # __eq__ must not be able to satisfy this assertion.
    assert result is None
def test_extract_from_title(self, processing_options):
    """A <title> tag is extracted into a Title holding the title text."""
    markup = '<title>My Title</title>'
    title_element = helper_functions.make_soup_from_html(markup).find('title')
    assert title_element.name == 'title'

    extracted = html_data_extractors.extract_from_title(
        title_element, processing_options)

    assert isinstance(extracted, Title)
    assert extracted.contents == 'My Title'
def test_extract_from_tag_with_coloured_text_span_no_color_in_style(
        self, processing_options):
    """A font-color span with an empty style attribute still yields its text.

    extract_from_tag is expected to return a list whose first item carries
    the text of the span.
    """
    markup = '<span class="font-color" style="">This is coloured.</span>'
    span_element = helper_functions.make_soup_from_html(markup).find('span')
    assert span_element.name == 'span'

    items = html_data_extractors.extract_from_tag(
        span_element, processing_options)

    assert isinstance(items, list)
    assert items[0].contents == 'This is coloured.'
def test_extract_from_iframe(self, processing_options):
    """An <iframe> is returned as a TextItem containing its raw HTML."""
    markup = '<iframe>My iframe</iframe>'
    iframe_element = helper_functions.make_soup_from_html(markup).find(
        'iframe')
    assert iframe_element.name == 'iframe'

    extracted = html_data_extractors.extract_from_iframe(
        iframe_element, processing_options)

    assert isinstance(extracted, TextItem)
    # The whole element, tags included, is expected back as the contents.
    assert extracted.contents == '<iframe>My iframe</iframe>'
def test_extract_from_heading_incorrect_tag(self, html, tag_name,
                                            processing_options):
    """A tag that is not a valid heading makes extract_from_heading return None.

    ``html`` and ``tag_name`` are supplied by the test's parametrization,
    which is defined outside this block.
    """
    soup = helper_functions.make_soup_from_html(html)
    tag = soup.find(tag_name)
    # Sanity check: the parametrized markup really contains the named tag.
    assert tag.name == tag_name

    result = html_data_extractors.extract_from_heading(
        tag, processing_options, None)

    # Identity check rather than ``== None``: an object with a permissive
    # __eq__ must not be able to satisfy this assertion.
    assert result is None
def test_extract_from_unknown_span_incorrect_tag(self, processing_options):
    """Passing a non-span tag (<body>) to extract_from_unknown_span yields None."""
    html = "<body>My Body</body>"
    soup = helper_functions.make_soup_from_html(html)
    tag = soup.find('body')
    # Sanity check: we are deliberately handing over the wrong tag type.
    assert tag.name == 'body'

    result = html_data_extractors.extract_from_unknown_span(
        tag, processing_options, None)

    # Identity check rather than ``== None``: an object with a permissive
    # __eq__ must not be able to satisfy this assertion.
    assert result is None
def test_extract_from_div(self, processing_options):
    """A <div> wrapping a <title> becomes a Paragraph with one Title child."""
    markup = "<div><title>My Title</title></div>"
    div_element = helper_functions.make_soup_from_html(markup).find('div')
    assert div_element.name == 'div'

    paragraph = html_data_extractors.extract_from_div(
        div_element, processing_options)

    assert isinstance(paragraph, Paragraph)
    assert len(paragraph.contents) == 1
    only_child = paragraph.contents[0]
    assert isinstance(only_child, Title)
    assert only_child.contents == 'My Title'
def test_extract_from_unknown_span(self, processing_options):
    """A plain <span> is extracted as a list starting with a TextItem of its text."""
    markup = '<span>a span</span>'
    span_element = helper_functions.make_soup_from_html(markup).find('span')
    assert span_element.name == 'span'

    items = html_data_extractors.extract_from_unknown_span(
        span_element, processing_options, None)

    assert isinstance(items, list)
    leading_item = items[0]
    assert isinstance(leading_item, TextItem)
    assert leading_item.contents == 'a span'
def test_extract_from_hyperlink(self, html, href, display_text,
                                processing_options):
    """An <a> tag becomes a Hyperlink with the expected href and display text.

    ``html``, ``href`` and ``display_text`` are supplied by the test's
    parametrization, which is defined outside this block.
    """
    anchor = helper_functions.make_soup_from_html(html).find('a')
    assert anchor.name == 'a'

    link = html_data_extractors.extract_from_hyperlink(
        anchor, processing_options)

    assert isinstance(link, Hyperlink)
    assert link.href == href
    assert link.contents == display_text
def test_extract_from_div_two_child_divs(self, processing_options):
    """Triple-nested divs are extracted as a list led by a one-item Paragraph."""
    # The innermost div holds a <br>; an earlier variant of this test used
    # the text "My Div" there instead.
    markup = "<div><div><div><br></div></div></div>"
    # find() returns the first match, i.e. the outermost <div>.
    outer_div = helper_functions.make_soup_from_html(markup).find('div')
    assert outer_div.name == 'div'

    items = html_data_extractors.extract_from_div(
        outer_div, processing_options)

    assert isinstance(items, list)
    leading_item = items[0]
    assert len(leading_item.contents) == 1
    assert isinstance(leading_item, Paragraph)
def test_extract_from_coloured_text_span_no_style(self, processing_options):
    """A <span> without a style attribute is not a coloured span: None is returned."""
    html = '<span>This is coloured.</span>'
    soup = helper_functions.make_soup_from_html(html)
    tag = soup.find('span')
    assert tag.name == 'span'

    result = html_data_extractors.extract_from_coloured_text_span(
        tag, processing_options)

    # Identity check rather than ``== None``: an object with a permissive
    # __eq__ must not be able to satisfy this assertion.
    assert result is None
def test_nimbus_outline_html_output(self, processing_options):
    """A Nimbus outline block renders to the expected nested HTML list.

    The outline markup is passed to extract_from_nimbus_outline and the
    html() output of the returned Outline is compared with the expected
    heading plus nested ordered lists.
    """
    # Both literals are split with implicit string concatenation purely for
    # readability; joined together they are single-line strings.
    outline_markup = (
        '<div class="outline" id="b406348235_764">'
        '<div contenteditable="false" class="outline-container">'
        '<div class="outline-content-wrapper ">'
        '<div class="outline-header ">'
        '<div class="outline-left"><div class="outline-expand-icon "> </div></div>'
        '<div class="outline-name">Outline</div></div>'
        '<div class="outline-body"><ul class="outline-list outline-numbered">'
        '<li class="outline-list-item level-0"><a href="#b1023299123_950">A test note of page content</a></li>'
        '<li class="outline-list-item level-1"><a href="#b1023299123_1009">Testing lists</a></li>'
        '<li class="outline-list-item level-1"><a href="#b1023299123_1042">Testing inserted files</a></li>'
        '<li class="outline-list-item level-1"><a href="#b1023299123_1086">Testing a table</a></li>'
        '<li class="outline-list-item level-1"><a href="#b1023299123_1130">There are only 3 levels of heading in nimbus</a></li>'
        '<li class="outline-list-item level-0"><a href="#b788977277_831">heading 1</a></li>'
        '<li class="outline-list-item level-1"><a href="#b788977277_860">heading 2</a></li>'
        '<li class="outline-list-item level-2"><a href="#b788977277_889">heading 3</a></li>'
        '<li class="outline-list-item level-0"><a href="#b1023299123_1757">heading with italic text</a></li>'
        '<li class="outline-list-item level-1"><a href="#b1023299123_1218">Testing the horizontal line</a></li>'
        '<li class="outline-list-item level-1"><a href="#b1023299123_1266">Link and embeds</a></li>'
        '<li class="outline-list-item level-1"><a href="#b992245780_93">Code Blocks</a></li>'
        '<li class="outline-list-item level-1"><a href="#b992245780_132">Nimbus mentions</a></li>'
        '<li class="outline-list-item level-1"><a href="#b992245780_175">Quoted text</a></li>'
        '<li class="outline-list-item level-1"><a href="#b992245780_196">Hints</a></li>'
        '<li class="outline-list-item level-1"><a href="#b992245780_220">Toggle block</a></li>'
        '<li class="outline-list-item level-1"><a href="#b2183561539_350">Outline (effectively a linked TOC)</a></li>'
        '<li class="outline-list-item level-1"><a href="#b992245780_450">Nimbus button</a></li>'
        '<li class="outline-list-item level-1"><a href="#b992245780_478">Text formatting</a></li>'
        '<li class="outline-list-item level-1"><a href="#b942953620_901">Testing inserted mp3</a></li>'
        '<li class="outline-list-item level-1"><a href="#b942953620_1059">Test block sections - may or may not export!</a></li>'
        '<li class="outline-list-item level-1"><a href="#b216345050_62">Adventures in Exporting from Nimbus Notes...</a></li>'
        '<li class="outline-list-item level-0"><a href="#b942953620_969">This is the end of the file</a></li>'
        '</ul></div></div></div></div>'
    )
    # find() returns the first match, i.e. the outermost outline <div>.
    outline_div = helper_functions.make_soup_from_html(
        outline_markup).find('div')
    assert outline_div.name == 'div'

    expected_html = (
        '<h2>Outline</h2><h4><ol>'
        '<li><a href="#b1023299123_950">A test note of page content</a></li>'
        '<ol>'
        '<li><a href="#b1023299123_1009">Testing lists</a></li>'
        '<li><a href="#b1023299123_1042">Testing inserted files</a></li>'
        '<li><a href="#b1023299123_1086">Testing a table</a></li>'
        '<li><a href="#b1023299123_1130">There are only 3 levels of heading in nimbus</a></li>'
        '</ol>'
        '<li><a href="#b788977277_831">heading 1</a></li>'
        '<ol>'
        '<li><a href="#b788977277_860">heading 2</a></li>'
        '<ol>'
        '<li><a href="#b788977277_889">heading 3</a></li>'
        '</ol></ol>'
        '<li><a href="#b1023299123_1757">heading with italic text</a></li>'
        '<ol>'
        '<li><a href="#b1023299123_1218">Testing the horizontal line</a></li>'
        '<li><a href="#b1023299123_1266">Link and embeds</a></li>'
        '<li><a href="#b992245780_93">Code Blocks</a></li>'
        '<li><a href="#b992245780_132">Nimbus mentions</a></li>'
        '<li><a href="#b992245780_175">Quoted text</a></li>'
        '<li><a href="#b992245780_196">Hints</a></li>'
        '<li><a href="#b992245780_220">Toggle block</a></li>'
        '<li><a href="#b2183561539_350">Outline (effectively a linked TOC)</a></li>'
        '<li><a href="#b992245780_450">Nimbus button</a></li>'
        '<li><a href="#b992245780_478">Text formatting</a></li>'
        '<li><a href="#b942953620_901">Testing inserted mp3</a></li>'
        '<li><a href="#b942953620_1059">Test block sections - may or may not export!</a></li>'
        '<li><a href="#b216345050_62">Adventures in Exporting from Nimbus Notes...</a></li>'
        '</ol>'
        '<li><a href="#b942953620_969">This is the end of the file</a></li>'
        '</ol></h4>'
    )

    outline = html_nimbus_extractors.extract_from_nimbus_outline(
        outline_div, processing_options)

    assert isinstance(outline, Outline)
    assert outline.html() == expected_html
def test_extract_from_p_or_i_tag(self, html, tag_name, expected,
                                 processing_options):
    """The named tag is extracted as a one-item list of the expected type.

    ``html``, ``tag_name`` and ``expected`` (the item class) are supplied by
    the test's parametrization, which is defined outside this block. Every
    case is expected to carry the text 'Some Text'.
    """
    element = helper_functions.make_soup_from_html(html).find(tag_name)
    assert element.name == tag_name

    items = html_data_extractors.extract_from_p_or_i_tag(
        element, processing_options)

    assert isinstance(items, list)
    assert len(items) == 1
    only_item = items[0]
    assert isinstance(only_item, expected)
    assert only_item.contents == 'Some Text'
def test_extract_from_head_tag(self, processing_options):
    """A <head> holding a <title> becomes a Head with a single Title child."""
    markup = "<head><title>My Title</title></head>"
    head_element = helper_functions.make_soup_from_html(markup).find('head')
    assert head_element.name == 'head'

    head = html_data_extractors.extract_from_head_tag(
        head_element, processing_options)

    assert isinstance(head, Head)
    assert len(head.contents) == 1
    only_child = head.contents[0]
    assert isinstance(only_child, Title)
    assert only_child.contents == 'My Title'
def test_extract_from_blockquote(self, processing_options):
    """A <blockquote> becomes a BlockQuote wrapping one TextItem of its text."""
    markup = '<blockquote cite="my-citation">My Quote</blockquote>'
    quote_element = helper_functions.make_soup_from_html(markup).find(
        'blockquote')
    assert quote_element.name == 'blockquote'

    quote = html_data_extractors.extract_from_blockquote(
        quote_element, processing_options)

    assert isinstance(quote, BlockQuote)
    assert len(quote.contents) == 1
    only_child = quote.contents[0]
    assert isinstance(only_child, TextItem)
    assert only_child.contents == 'My Quote'
def test_extract_from_body(self, processing_options):
    """A <section> holding a <title> becomes a SectionContent with one Title.

    Despite the test's name, the function exercised here is
    extract_from_section, applied to a <section> element.
    """
    markup = "<section><title>My Title</title></section>"
    section_element = helper_functions.make_soup_from_html(markup).find(
        'section')
    assert section_element.name == 'section'

    section = html_data_extractors.extract_from_section(
        section_element, processing_options)

    assert isinstance(section, SectionContent)
    assert len(section.contents) == 1
    only_child = section.contents[0]
    assert isinstance(only_child, Title)
    assert only_child.contents == 'My Title'
def test_extract_from_coloured_text_span(self, processing_options):
    """A font-color span with a colour style becomes a TextColorItem.

    The item's contents are expected to be a span carrying only the style
    attribute, with the original text exposed as plain_text and the
    processing options stored on the item.
    """
    markup = '<span class="font-color" style="color: rgb(237, 84, 84);">This is coloured.</span>'
    span_element = helper_functions.make_soup_from_html(markup).find('span')
    assert span_element.name == 'span'

    coloured = html_data_extractors.extract_from_coloured_text_span(
        span_element, processing_options)

    assert isinstance(coloured, TextColorItem)
    # Note the class attribute is absent from the expected contents.
    assert coloured.contents == '<span style="color: rgb(237, 84, 84);">This is coloured.</span>'
    assert coloured.plain_text == 'This is coloured.'
    assert coloured.processing_options == processing_options
def test_extract_text_formatting(self, processing_options):
    """A <strong> tag becomes a TextFormatItem with format 'strong'.

    The markdown format-styling table is passed as the styling argument;
    the item's first child is expected to hold the bold text and the item
    itself to keep the processing options.
    """
    markup = "<strong>bold text</strong>"
    strong_element = helper_functions.make_soup_from_html(markup).find(
        'strong')
    assert strong_element.name == 'strong'

    formatted = html_data_extractors.extract_text_formatting(
        strong_element,
        markdown_format_styling.format_styling,
        processing_options)

    assert isinstance(formatted, TextFormatItem)
    assert formatted.format == 'strong'
    assert formatted.contents[0].contents == 'bold text'
    assert formatted.processing_options == processing_options
def test_extract_from_heading(self, html, tag_name, expected_level,
                              expected_id, processing_options):
    """A heading tag becomes a HeadingItem with the expected level and id.

    ``html``, ``tag_name``, ``expected_level`` and ``expected_id`` are
    supplied by the test's parametrization, which is defined outside this
    block; those cases are intended to confirm heading levels are
    restricted to 1-6. Every case is expected to carry the text
    'My heading'.
    """
    heading_element = helper_functions.make_soup_from_html(html).find(
        tag_name)
    assert heading_element.name == tag_name

    heading = html_data_extractors.extract_from_heading(
        heading_element, processing_options, None)

    assert isinstance(heading, HeadingItem)
    assert len(heading.contents) == 1
    assert heading.level == expected_level
    assert heading.id == expected_id
    only_child = heading.contents[0]
    assert isinstance(only_child, TextItem)
    assert only_child.contents == 'My heading'
def test_extract_from_image_tag(self, html, src, alt, width, height,
                                processing_options):
    """An <img> tag becomes an ImageEmbed carrying its attributes and path.

    ``html``, ``src``, ``alt``, ``width`` and ``height`` are supplied by the
    test's parametrization, which is defined outside this block.
    """
    img_element = helper_functions.make_soup_from_html(html).find('img')
    assert img_element.name == 'img'

    image = html_data_extractors.extract_from_img_tag(
        img_element, processing_options)

    assert isinstance(image, ImageEmbed)
    assert image.href == src
    assert image.contents == alt
    assert image.width == width
    assert image.height == height
    # Path-derived attributes are checked against the same src value.
    src_path = Path(src)
    assert image.source_path == src_path
    assert image.filename == src_path.name
# NOTE(review): the whitespace inside `expected` below looks collapsed to single spaces (a nested numbered list cannot sit on one line) - confirm the literal against the real multi-line markdown output before relying on this test.
def test_nimbus_outline_markdown_output(self, processing_options): """Check the 'gfm' markdown rendering of a Nimbus outline block against the expected numbered list of links.""" html = '<div class="outline" id="b406348235_764"><div contenteditable="false" class="outline-container"><div class="outline-content-wrapper "><div class="outline-header "><div class="outline-left"><div class="outline-expand-icon "> </div></div><div class="outline-name">Outline</div></div><div class="outline-body"><ul class="outline-list outline-numbered"><li class="outline-list-item level-0"><a href="#b1023299123_950">A test note of page content</a></li><li class="outline-list-item level-1"><a href="#b1023299123_1009">Testing lists</a></li><li class="outline-list-item level-1"><a href="#b1023299123_1042">Testing inserted files</a></li><li class="outline-list-item level-1"><a href="#b1023299123_1086">Testing a table</a></li><li class="outline-list-item level-1"><a href="#b1023299123_1130">There are only 3 levels of heading in nimbus</a></li><li class="outline-list-item level-0"><a href="#b788977277_831">heading 1</a></li><li class="outline-list-item level-1"><a href="#b788977277_860">heading 2</a></li><li class="outline-list-item level-2"><a href="#b788977277_889">heading 3</a></li><li class="outline-list-item level-0"><a href="#b1023299123_1757">heading with italic text</a></li><li class="outline-list-item level-1"><a href="#b1023299123_1218">Testing the horizontal line</a></li><li class="outline-list-item level-1"><a href="#b1023299123_1266">Link and embeds</a></li><li class="outline-list-item level-1"><a href="#b992245780_93">Code Blocks</a></li><li class="outline-list-item level-1"><a href="#b992245780_132">Nimbus mentions</a></li><li class="outline-list-item level-1"><a href="#b992245780_175">Quoted text</a></li><li class="outline-list-item level-1"><a href="#b992245780_196">Hints</a></li><li class="outline-list-item level-1"><a href="#b992245780_220">Toggle block</a></li><li class="outline-list-item level-1"><a href="#b2183561539_350">Outline (effectively a linked 
TOC)</a></li><li class="outline-list-item level-1"><a href="#b992245780_450">Nimbus button</a></li><li class="outline-list-item level-1"><a href="#b992245780_478">Text formatting</a></li><li class="outline-list-item level-1"><a href="#b942953620_901">Testing inserted mp3</a></li><li class="outline-list-item level-1"><a href="#b942953620_1059">Test block sections - may or may not export!</a></li><li class="outline-list-item level-1"><a href="#b216345050_62">Adventures in Exporting from Nimbus Notes...</a></li><li class="outline-list-item level-0"><a href="#b942953620_969">This is the end of the file</a></li></ul></div></div></div></div>' soup = helper_functions.make_soup_from_html(html) tag = soup.find('div') assert tag.name == 'div' expected = """## Outline 1. [A test note of page content](#a-test-note-of-page-content) 1. [Testing lists](#testing-lists) 2. [Testing inserted files](#testing-inserted-files) 3. [Testing a table](#testing-a-table) 4. [There are only 3 levels of heading in nimbus](#there-are-only-3-levels-of-heading-in-nimbus) 2. [heading 1](#heading-1) 1. [heading 2](#heading-2) 1. [heading 3](#heading-3) 3. [heading with italic text](#heading-with-italic-text) 1. [Testing the horizontal line](#testing-the-horizontal-line) 2. [Link and embeds](#link-and-embeds) 3. [Code Blocks](#code-blocks) 4. [Nimbus mentions](#nimbus-mentions) 5. [Quoted text](#quoted-text) 6. [Hints](#hints) 7. [Toggle block](#toggle-block) 8. [Outline (effectively a linked TOC)](#outline--effectively-a-linked-toc-) 9. [Nimbus button](#nimbus-button) 10. [Text formatting](#text-formatting) 11. [Testing inserted mp3](#testing-inserted-mp3) 12. [Test block sections - may or may not export!](#test-block-sections---may-or-may-not-export-) 13. [Adventures in Exporting from Nimbus Notes...](#adventures-in-exporting-from-nimbus-notes...) 4. 
[This is the end of the file](#this-is-the-end-of-the-file) """ processing_options.export_format = 'gfm' result = html_nimbus_extractors.extract_from_nimbus_outline(tag, processing_options) assert isinstance(result, Outline) assert result.markdown() == expected
def test_make_soup():
    """make_soup_from_html returns a BeautifulSoup object for simple markup."""
    markup = '<p>hello</p>'
    soup = helper_functions.make_soup_from_html(markup)
    assert isinstance(soup, BeautifulSoup)