def testmakeelement():
    '''Check the tag, attributes and text of an element built by _makeelement'''
    expected_tag = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}testname'
    expected_attrib = {
        '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}testattribute': 'testvalue',
    }
    document = Docx()
    element = document._makeelement(
        'testname',
        attributes={'testattribute': 'testvalue'},
        tagtext='testtagtext',
    )
    # Both the tag and the attribute key are compared in their
    # namespace-qualified '{uri}name' form.
    assert element.tag == expected_tag
    assert element.attrib == expected_attrib
    assert element.text == 'testtagtext'
def testtable():
    '''Ensure tables make sense

    Builds a 3x2 table and checks that the text found in the second cell of
    the second row is 'B2'.
    '''
    docx = Docx()
    testtable = docx.table([['A1', 'A2'], ['B1', 'B2'], ['C1', 'C2']])
    # Path: table -> 2nd row -> 2nd cell -> paragraph -> run -> text.
    matches = testtable.xpath(
        '/ns0:tbl/ns0:tr[2]/ns0:tc[2]/ns0:p/ns0:r/ns0:t',
        namespaces={'ns0': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
    # Fail with a readable assertion instead of an IndexError when the
    # expected cell is missing from the generated table.
    assert matches, 'no text element found in row 2, cell 2 of the table'
    assert matches[0].text == 'B2'
def testnewdocument():
    '''Create a new document, set its core properties and save it to TEST_FILE'''
    # Positional arguments for coreproperties(), in call order.
    properties = (
        'Python docx testnewdocument',
        'A short example of making docx from Python',
        'Alan Brooks',
        ['python', 'Office Open XML', 'Word'],
    )
    document = Docx()
    document.coreproperties(*properties)
    document.savedocx(TEST_FILE)
def testunsupportedpagebreak():
    '''Check that pagebreak() raises ValueError for an unsupported type'''
    document = Docx()
    rejected = False
    try:
        document.pagebreak(type='unsup')
    except ValueError:
        # The unsupported type was refused, which is the passing outcome.
        rejected = True
    assert rejected
def testtable():
    '''Ensure tables make sense

    Builds a 3x2 table and checks that the text found in the second cell of
    the second row is 'B2'.
    '''
    docx = Docx()
    testtable = docx.table([['A1', 'A2'], ['B1', 'B2'], ['C1', 'C2']])
    # Path: table -> 2nd row -> 2nd cell -> paragraph -> run -> text.
    matches = testtable.xpath(
        '/ns0:tbl/ns0:tr[2]/ns0:tc[2]/ns0:p/ns0:r/ns0:t',
        namespaces={
            'ns0': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
        })
    # Fail with a readable assertion instead of an IndexError when the
    # expected cell is missing from the generated table.
    assert matches, 'no text element found in row 2, cell 2 of the table'
    assert matches[0].text == 'B2'
def testmakeelement():
    '''Check the tag, attributes and text of an element built by _makeelement'''
    expected_tag = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}testname'
    expected_attrib = {
        '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}testattribute':
        'testvalue'
    }
    document = Docx()
    element = document._makeelement(
        'testname',
        attributes={'testattribute': 'testvalue'},
        tagtext='testtagtext',
    )
    # Both the tag and the attribute key are compared in their
    # namespace-qualified '{uri}name' form.
    assert element.tag == expected_tag
    assert element.attrib == expected_attrib
    assert element.text == 'testtagtext'
def testopendocx():
    '''Check that opening TEST_FILE yields an lxml element as the document'''
    opened = Docx(TEST_FILE)
    assert isinstance(opened._document, lxml.etree._Element)
def from_word(self, file):
    """Append the English paragraphs of a Word document to ``self.paragraphs``.

    Three extraction strategies are tried in order; each later one runs only
    when the previous one raised:

    1. ``Docx(file).paragraphs``.
    2. The output of ``antiword -t``, decoded as UTF-8 and split on blank
       lines.
    3. ``textutil -convert txt``, then reading the converted ``.txt`` file
       line by line.

    A candidate is kept when ``detect(candidate) == 'en'`` and is wrapped in
    ``Text`` before being stored.

    Args:
        file: Path of the Word document to read.

    Raises:
        Exception: Anything raised by the third strategy (for example
            ``OSError`` when the converted file cannot be opened), since it
            is the last resort and is not guarded.
    """
    import os

    def english_only(candidates):
        # Build the complete list first, so a strategy that fails part-way
        # through adds nothing to self.paragraphs before the fallback runs.
        return [Text(item) for item in candidates if detect(item) == 'en']

    try:
        # Docx
        # NOTE(review): paragraphs are handed to detect()/Text() unchanged,
        # as the original intended; if Docx yields objects rather than
        # strings this branch will still fall through - confirm.
        found = english_only(Docx(file).paragraphs)
    except Exception:
        try:
            # Doc
            raw_output = subprocess.check_output(['antiword', '-t', file])
            # Decode and split by paragraphs
            found = english_only(raw_output.decode('utf-8').split('\n\n'))
        except Exception:
            # If Antiword does not work, convert to txt
            subprocess.run(['textutil', '-convert', 'txt', file])
            # Swap only the final extension: 'x.docx' -> 'x.txt', and a
            # '.doc' elsewhere in the path is left alone.
            txt_path = os.path.splitext(file)[0] + '.txt'
            with open(txt_path, 'r') as f:
                found = english_only(f.readlines())
    self.paragraphs.extend(found)
def get_lines_from_source(self):
    """
    Return the lines of the source document as a tuple of strings.

    For a 'txt' or empty extension every line read from ``self.doc_file`` is
    decoded as UTF-8. For 'docx' the file content is loaded through ``Docx``
    and the text of each paragraph is returned. 'pdf' raises
    ``NotImplementedError``; any other extension raises ``ValueError``.
    """
    ext = self.get_doc_file_extension()

    if ext in ('txt', ''):
        raw_lines = self.doc_file.readlines()
        return tuple([raw.decode('utf-8') for raw in raw_lines])

    if ext == 'docx':
        content = BytesIO(self.doc_file.read())
        document = Docx(content)
        return tuple([para.text for para in document.paragraphs])

    if ext == 'pdf':
        raise NotImplementedError()

    raise ValueError("file_format not supported")
def get_full_text_from_source(self):
    """
    Return the whole text of the source document as one string.

    For a 'txt' or empty extension the content of ``self.doc_file`` is
    decoded as UTF-8. For 'docx' the file content is loaded through ``Docx``
    and the paragraph texts are joined with newlines. 'pdf' raises
    ``NotImplementedError``; any other extension raises ``ValueError``.
    """
    ext = self.get_doc_file_extension()

    if ext in ('txt', ''):
        raw = self.doc_file.read()
        return raw.decode("utf-8")

    if ext == 'docx':
        document = Docx(BytesIO(self.doc_file.read()))
        texts = [para.text for para in document.paragraphs]
        return "\n".join(texts)

    if ext == 'pdf':
        raise NotImplementedError()

    raise ValueError("file_format not supported")
"""
This file makes a .docx (Word 2007) file from scratch, showing off most of the
features of python-docx.

If you need to make documents from scratch, you can use this file as a basis
for your work.

Part of Python's docx module - http://github.com/mikemaccana/python-docx
See LICENSE for licensing information.
"""

from docx import Docx

if __name__ == '__main__':
    # Start from an empty document
    docx = Docx()

    # Two headings (text, level) followed by an introductory paragraph
    for heading_text, heading_level in (
            ("Welcome to Python's docx module", 1),
            ('Make and edit docx in 200 lines of pure Python', 2)):
        docx.heading(heading_text, heading_level)

    intro = ('The module was created when I was looking for a '
             'Python support for MS Word .doc files on PyPI and Stackoverflow. '
             'Unfortunately, the only solutions I could find used:')
    docx.paragraph(intro)

    # One 'ListNumber'-styled paragraph per entry
    points = [
        'COM automation',
        '.net or Java',
        'Automating OpenOffice or MS Office',
    ]
    for point in points:
        docx.paragraph(point, style='ListNumber')
If you need to extract text from documents, use this file as a basis for your work. Part of Python's docx module - http://github.com/mikemaccana/python-docx See LICENSE for licensing information. """ import sys from docx import Docx if __name__ == '__main__': docx = None try: docx = Docx(sys.argv[1]) newfile = open(sys.argv[2], 'w') except: print( "Please supply an input and output file. For example:\n" " example-extracttext.py 'My Office 2007 document.docx'" " 'outputfile.txt'" ) exit() # Fetch all the text out of the document we just created paratextlist = docx.getdocumenttext() # Make explicit unicode version newparatextlist = [] for paratext in paratextlist:
def simpledoc():
    '''Build and return a sample Docx for use in other docx tests

    The document gets two headings, a paragraph, three numbered list items,
    a page break, a second paragraph, a 3x3 table, a portrait section break,
    a picture, a landscape section break and a final paragraph, in that
    order.
    '''
    sample = Docx()

    for title, level in (('Heading 1', 1), ('Heading 2', 2)):
        sample.heading(title, level)
    sample.paragraph('Paragraph 1')

    list_items = ('List Item 1', 'List Item 2', 'List Item 3')
    for item in list_items:
        sample.paragraph(item, style='ListNumber')

    sample.pagebreak(type='page')
    sample.paragraph('Paragraph 2')

    grid = [
        ['A1', 'A2', 'A3'],
        ['B1', 'B2', 'B3'],
        ['C1', 'C2', 'C3'],
    ]
    sample.table(grid)

    sample.pagebreak(type='section', orient='portrait')
    sample.picture(IMAGE1_FILE, 'This is a test description')
    sample.pagebreak(type='section', orient='landscape')
    sample.paragraph('Paragraph 3')

    return sample
def testparagraph():
    '''Check that paragraph() returns an element with the namespaced p tag'''
    expected_tag = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}p'
    para = Docx().paragraph('paratext', style='BodyText')
    assert para.tag == expected_tag
def testtextextraction():
    '''Check that getdocumenttext() returns at least one item for TEST_FILE'''
    extracted = Docx(TEST_FILE).getdocumenttext()
    assert len(extracted) > 0
If you need to extract text from documents, use this file as a basis for your work. Part of Python's docx module - http://github.com/mikemaccana/python-docx See LICENSE for licensing information. """ import sys from docx import Docx if __name__ == '__main__': docx = None try: docx = Docx(sys.argv[1]) newfile = open(sys.argv[2], 'w') except: print("Please supply an input and output file. For example:\n" " example-extracttext.py 'My Office 2007 document.docx'" " 'outputfile.txt'") exit() # Fetch all the text out of the document we just created paratextlist = docx.getdocumenttext() # Make explicit unicode version newparatextlist = [] for paratext in paratextlist: newparatextlist.append(paratext.encode("utf-8"))