def test_docx_writing():
    """Round-trip a document's XML through ``write_new_document``.

    Reads the XML of the fixture, writes it to a randomly named ``.docx``
    file and checks that reading that file back yields the same XML. The
    generated file is removed even when the write or the comparison fails.
    """
    filename = 'test_data/small_table.docx'
    xml = get_docx_xml(filename)
    stem = ''.join(random.choice('abcdef') for _ in range(10))  # noqa: S311
    output = stem + '.docx'
    try:
        write_new_document(filename, str(xml), output)
        xml_2 = get_docx_xml(output)
        assert xml == xml_2
    finally:
        # Clean up unconditionally. The file may not exist if the write
        # failed, and removing a missing file would mask the real error.
        if os.path.exists(output):
            os.remove(output)
def test_remove_tables_and_bodies():
    """Check the references returned by ``_remove_tables_and_bodies``.

    On the small text fixture, no table reference and four body references
    are expected, and every body reference must appear in the string form
    of the returned soup.
    """
    document = BeautifulSoup(
        get_docx_xml('test_data/small_text.docx'), 'lxml-xml')
    stripped, table_refs, body_refs = _remove_tables_and_bodies(document)

    assert len(table_refs) == 0
    assert len(body_refs) == 4

    rendered = str(stripped)
    missing = [ref for ref in body_refs if ref not in rendered]
    assert not missing
def test_extract_tags():
    """Check ``_extract_tags`` used with ``_find_first_r_tag``.

    Exactly one reference is expected; its key must appear in the string
    form of the returned soup, and the number of stripped strings must
    differ between the returned soup and the input soup.
    """
    raw_xml = get_docx_xml('test_data/small_text.docx')
    original = BeautifulSoup(raw_xml, 'lxml-xml')

    extracted, refs = _extract_tags(original, _find_first_r_tag)

    assert len(refs) == 1
    only_key = next(iter(refs.keys()))
    assert only_key in str(extracted)

    strings_after = list(extracted.stripped_strings)
    strings_before = list(original.stripped_strings)
    assert len(strings_after) != len(strings_before)
def test_extract_headers():
    """Check ``extract_headers`` on an empty soup and on the text fixture."""
    # An empty HTML document has no headers.
    empty_document = BeautifulSoup('', 'html.parser')
    assert extract_headers(empty_document) == []

    # The small text fixture contains exactly these two headers, in order.
    expected_headers = [
        'Article 6.2.3. Auto surveillance des niveaux sonores',
        'Chapitre 6.3 – Vibrations',
    ]
    document = BeautifulSoup(
        get_docx_xml('test_data/small_text.docx'), 'lxml-xml')
    assert extract_headers(document) == expected_headers
def test_guess_body_font_size():
    """Check ``_guess_body_font_size`` on the fixture and on a bare tag.

    The small text fixture must yield 24, and a document reduced to
    ``<w></w>`` must raise ``DocxNoTextError``.
    """
    document = BeautifulSoup(
        get_docx_xml('test_data/small_text.docx'), 'lxml-xml')
    assert _guess_body_font_size(document) == 24

    textless = BeautifulSoup('<w></w>', 'lxml-xml')
    with pytest.raises(DocxNoTextError):
        _guess_body_font_size(textless)
def test_build_structured_text_from_docx_xml():
    """Check the structure built from the small text fixture.

    The result must have an empty title and two top-level sections, each
    without sub-sections and carrying the expected title text.
    """
    structured = build_structured_text_from_docx_xml(
        get_docx_xml('test_data/small_text.docx'))

    assert structured.title.text == ''
    assert len(structured.sections) == 2

    expected_titles = (
        'Article 6.2.3. Auto surveillance des niveaux sonores',
        'Chapitre 6.3 – Vibrations',
    )
    for section, expected_title in zip(structured.sections, expected_titles):
        assert len(section.sections) == 0
        assert section.title.text == expected_title
def test_replace_small_tables():
    """Check tag counts before and after ``_replace_small_tables``.

    The fixture starts with one ``w:tbl``, five ``w:p`` and three ``w:tc``
    tags. In the returned soup, no ``w:tbl`` or ``w:tc`` may remain and the
    number of ``w:p`` tags must still be five.
    """

    def count_tags(tree):
        # Number of occurrences of each tag of interest in ``tree``.
        return {
            tag: len(list(tree.find_all(tag)))
            for tag in ('w:tbl', 'w:p', 'w:tc')
        }

    document = BeautifulSoup(
        get_docx_xml('test_data/small_table.docx'), 'lxml-xml')
    assert count_tags(document) == {'w:tbl': 1, 'w:p': 5, 'w:tc': 3}

    document = _replace_small_tables(document)
    assert count_tags(document) == {'w:tbl': 0, 'w:p': 5, 'w:tc': 0}
def test_replace_tables_and_body_text_with_empty_p():
    """Check ``_replace_tables_and_body_text_with_empty_p`` on two inputs.

    For the small text fixture, only the two header strings may remain in
    the returned soup. For an empty document, called with ``10`` as the
    second positional argument, no string may remain.
    """
    document = BeautifulSoup(
        get_docx_xml('test_data/small_text.docx'), 'lxml-xml')
    headers_only = _replace_tables_and_body_text_with_empty_p(document)
    remaining = list(headers_only.stripped_strings)
    assert remaining == [
        'Article 6.2.3. Auto surveillance des niveaux sonores',
        'Chapitre 6.3 – Vibrations',
    ]

    blank = BeautifulSoup('', 'lxml-xml')
    blank_result = _replace_tables_and_body_text_with_empty_p(blank, 10)
    assert list(blank_result.stripped_strings) == []
def test_extract_elements():
    """Check the elements extracted from the small text fixture.

    Six elements are expected, none of them a ``Table``. Positions 2 and 4
    must be ``Title`` instances of level 3 and 2 respectively, and every
    other position must hold a ``str``.
    """
    document = BeautifulSoup(
        get_docx_xml('test_data/small_text.docx'), 'lxml-xml')
    elements = _extract_elements(document)

    assert len(elements) == 6
    assert not any(isinstance(item, Table) for item in elements)

    # Maps the position of each expected title to its expected level.
    title_levels = {2: 3, 4: 2}
    for position, item in enumerate(elements):
        if position in title_levels:
            assert isinstance(item, Title)
            assert check_is_title(item).level == title_levels[position]
        else:
            assert isinstance(item, str)
def test_copy_soup():
    """Check that ``_copy_soup`` returns a different object than its input."""
    source_xml = get_docx_xml('test_data/small_table.docx')
    original = BeautifulSoup(source_xml, 'lxml-xml')
    duplicate = _copy_soup(original)
    assert duplicate is not original
def test_get_docx_xml():
    """Check the length of the XML read from the simple table fixture."""
    expected_length = 6580
    content = get_docx_xml('test_data/simple_table.docx')
    assert len(content) == expected_length