def parseDocx(inDoc):
    """Extract the paragraphs of a .docx file.

    Returns a list of paragraph strings and writes their repr() to
    temp/temp.txt (the 'temp' directory must already exist).
    parseDocx is a derivative of <https://github.com/mickmaccana/python-docx>.
    """
    with open('temp/temp.txt', 'w+') as temp:
        import zipfile
        try:
            from xml.etree.cElementTree import XML
        except ImportError:
            from xml.etree.ElementTree import XML
            print("Running in compatibility mode")
        WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
        PARA = WORD_NAMESPACE + 'p'
        TEXT = WORD_NAMESPACE + 't'
        document = zipfile.ZipFile(inDoc)
        xml_content = document.read('word/document.xml')
        document.close()
        tree = XML(xml_content)
        paragraphs = []
        # .iter() replaces Element.getiterator(), removed in Python 3.9.
        for paragraph in tree.iter(PARA):
            texts = [node.text for node in paragraph.iter(TEXT) if node.text]
            if texts:
                paragraphs.append(''.join(texts))
        temp.write(repr(paragraphs))
        return paragraphs  # this should be a list of all the paragraph strings
def get_raw_text(pthFile):
    """Return a list containing the paragraphs of the given word (.docx) file."""
    # Constants used to iterate over the XML tree.
    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'
    docWordDoc = zipfile.ZipFile(pthFile)  # a .docx is a zip archive
    xmlContent = docWordDoc.read('word/document.xml')  # access the xml file
    docWordDoc.close()
    treeXML = XML(xmlContent)  # parse the xml content into an element tree
    lstParagraphs = []  # output list with the paragraphs of the text
    # Iterate over the tree; for each node that contains text, extract it
    # and add it to the output.
    # .iter() replaces getiterator(), which was removed in Python 3.9.
    for parParagraph in treeXML.iter(PARA):
        lstTexts = [nodElement.text
                    for nodElement in parParagraph.iter(TEXT)
                    if nodElement.text]
        if lstTexts:
            print(lstTexts)  # was a Python 2 print statement (SyntaxError on 3.x)
            lstParagraphs.append(''.join(lstTexts))
    return lstParagraphs
def get_docx_text(path=os.getcwd() + '\\word_samples'):
    """
    Take the path of a docx file as argument, return the text in unicode,
    with header2/footer2 paragraphs prefixed by "Header : " / "Footer : ".

    NOTE(review): the default path is computed once, at definition time
    (os.getcwd()); header2.xml / footer2.xml must exist in the archive or
    ZipFile.read() raises KeyError.
    """
    document = zipfile.ZipFile(path)
    contentToRead = ["header2.xml", "document.xml", "footer2.xml"]
    paragraphs = []
    for xmlfile in contentToRead:
        xml_content = document.read('word/{}'.format(xmlfile))
        tree = XML(xml_content)
        # .iter() replaces getiterator(), removed in Python 3.9.
        for paragraph in tree.iter(PARA):
            texts = [node.text for node in paragraph.iter(TEXT) if node.text]
            if texts:
                textData = ''.join(texts)
                if xmlfile == "footer2.xml":
                    extractedTxt = "Footer : " + textData
                elif xmlfile == "header2.xml":
                    extractedTxt = "Header : " + textData
                else:
                    extractedTxt = textData
                paragraphs.append(extractedTxt)
    document.close()
    return '\n\n'.join(paragraphs)
def docxml_to_text(self, filename):
    """Extract the plain text of a .docx file, paragraphs separated by blank lines.

    Note: a text entry is appended for EVERY w:p paragraph, including empty
    ones (original behaviour, kept).
    """
    document = zipfile.ZipFile(filename)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)
    sections = []
    # .iter() replaces getiterator(), removed in Python 3.9.
    for section in tree.iter(self.PARA):
        texts = ''
        for node in section.iter(self.TEXT):
            if node.text:
                texts += node.text
        sections.append(''.join(texts))  # ''.join of a str is the str itself
    # (removed a dead, commented-out alternative implementation)
    return '\n\n'.join(sections)
def parse_config(soup):
    """
    There are lots of goodies in the config we get back from the ABC.
    In particular, it gives us the URLs of all the other XML data we need.

    Returns the full param dict, augmented with *_url convenience keys.
    """
    xml = XML(soup)
    params = dict()
    # .iter() replaces getiterator(), removed in Python 3.9.
    for param in xml.iter('param'):
        params.setdefault(param.get('name'), param.get('value'))

    # should look like "rtmp://cp53909.edgefcs.net/ondemand"
    # Looks like the ABC don't always include this field.
    # If not included, that's okay -- ABC usually gives us the server in
    # the auth result as well.
    rtmp_url = params['server_streaming']
    categories_url = params['categories']
    params.update({
        'rtmp_url': rtmp_url,
        'auth_url': params['auth'],
        'api_url': params['api'],
        'categories_url': categories_url,
        'captions_url': params['captions'],
    })
    return params
def docx_extractor(path, vectors=False):
    """Extract creator metadata and per-paragraph text from a docx.

    Returns (creator, doc) or, when vectors=True, (creator, doc, vector),
    where doc maps the paragraph number (as str) to its fixed text.
    """
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')

    ## GET METADATA
    # use lxml to parse the core-properties part we are interested in
    try:
        doc = lxml.etree.fromstring(document.read('docProps/core.xml'))
        # retrieve creator
        ns = {'dc': 'http://purl.org/dc/elements/1.1/'}
        creator = doc.xpath('//dc:creator', namespaces=ns)[0].text
    except Exception:  # was a bare except; keep the best-effort fallback
        creator = "Unknown"
    document.close()

    tree = XML(xml_content)
    doc = {}
    vector = {}
    paragraph_nb = 1
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            text = ''.join(texts)
            doc[str(paragraph_nb)] = fix_text(text)
            if vectors:
                vector[str(paragraph_nb)] = vectorizer(text, lang=detect(text))
            # only non-empty paragraphs get a number (assumed — TODO confirm)
            paragraph_nb += 1
    if vectors:
        return creator, doc, vector
    else:
        return creator, doc
def epilepsy_docx_xml_to_txt(
        path, n_xml,
        docx_xml_to_txt_save_path="L:\\word_docs\\epilepsy_docx_xml_to_txt\\"):
    """
    Take the path of a docx file as argument, return the text in unicode.
    Run this if epilepsy_docx() isn't able to read the name.
    This should automatically read tables anyway.

    Returns (text, n_xml + 1); the text is also saved via save_as_txt().
    """
    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)
    paragraphs = []
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [n.text for n in paragraph.iter(TEXT) if n.text]
        if texts:
            paragraphs.append(''.join(texts))
    pt_txt_xml = '\n\n'.join(paragraphs)
    save_as_txt(path, pt_txt_xml, docx_xml_to_txt_save_path)  # external helper
    n_xml += 1  # bump the caller-supplied counter and hand it back
    return pt_txt_xml, n_xml
def parse_captions(soup):
    """
    Converts custom iView captions into SRT format, usable in most decent
    media players.
    """
    # Escape literal ampersands, which have been seen in some captions XML.
    # BUG FIX: the replacement was b"&" (a no-op), so malformed XML still
    # failed to parse; it must be b"&amp;". Inspired by
    # http://stackoverflow.com/questions/6088760/fix-invalid-xml-with-ampersands-in-python
    if b"<![CDATA[" not in soup:  # Not seen, but be future proof
        soup = re.sub(rb"&(?![#\w]+;)", b"&amp;", soup)
    xml = XML(soup)
    output = ''
    i = 1
    # .iter() replaces getiterator(), removed in Python 3.9.
    for title in xml.iter('title'):
        start = title.get('start')
        ids = start.rfind(':')  # last ':' separates the fractional seconds
        end = title.get('end')
        ide = end.rfind(':')
        output = output + str(i) + '\n'
        output = output + start[:ids] + ',' + start[ids+1:] + ' --> ' + end[:ide] + ',' + end[ide+1:] + '\n'
        output = output + title.text.replace('|', '\n') + '\n\n'  # '|' marks a line break
        i += 1
    return output
def get_docx_table(path):
    """ Find the table inside the .docx file and return it in an array
    (list of rows, each a list of cell strings). """
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)
    rows = []
    # .iter() replaces getiterator(), removed in Python 3.9.
    for xml_row in tree.iter(TR):
        row = []
        for xml_cell in xml_row.iter(TC):
            # Each cell consists of one or more paragraphs.
            text = ""
            for paragraph in xml_cell.iter(PARA):
                texts = [node.text for node in paragraph.iter(TEXT) if node.text]
                paragraph_text = "".join(texts)
                if paragraph_text:
                    text += paragraph_text + "\n"
            # Drop the newline appended after the last paragraph.
            if text.endswith("\n"):
                text = text[0:-1]
            row.append(text)
        rows.append(row)
    return rows
def parse_captions(soup):
    """
    Converts custom iView captions into SRT format, usable in most decent
    media players.
    """
    # Escape literal ampersands, which have been seen in some captions XML.
    # BUG FIX: the replacement was b"&" (a no-op); it must be b"&amp;" for
    # the text to become well-formed XML. Inspired by
    # http://stackoverflow.com/questions/6088760/fix-invalid-xml-with-ampersands-in-python
    if b"<![CDATA[" not in soup:  # Not seen, but be future proof
        soup = re.sub(rb"&(?![#\w]+;)", b"&amp;", soup)
    xml = XML(soup)
    output = ''
    i = 1
    # .iter() replaces getiterator(), removed in Python 3.9.
    for title in xml.iter('title'):
        start = title.get('start')
        (start, startfract) = start.rsplit(':', 1)  # "h:mm:ss", "fff"
        end = title.get('end')
        (end, endfract) = end.rsplit(':', 1)
        output = output + '{}\n'.format(i)
        output = output + '{},{:0<3.3} --> {},{:0<3.3}\n'.format(start, startfract, end, endfract)
        output = output + title.text.replace('|', '\n') + '\n\n'  # '|' marks a line break
        i += 1
    return output
def get_email_para(path):
    """Restyle the paragraph preceding the first e-mail address in a docx.

    Finds the first e-mail address in the document text, styles the paragraph
    containing the text before it as 'Correspondence' and saves a
    *_PROCESSED.docx copy.  Returns True when a copy was saved.

    NOTE(review): raises IndexError when the document contains no address.
    """
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)
    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'
    paragraphs = []
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))
    text = ''
    for paragraph in paragraphs:
        text += ' ' + paragraph
    result = ''
    reresult = False
    match = re.findall(r'[\w\.-]+@[\w\.-]+', text)
    for paragraph in paragraphs:
        if match[0] in paragraph:
            result = (paragraph.split(match[0])[0])
    document = Document(path)  # python-docx: re-open for styling
    paras = list(document.paragraphs)
    for paragraph in paras:
        if result in paragraph.text:
            paragraph.style = 'Correspondence'
            # BUG FIX: this used the undefined name `file`; derive the output
            # name from the `path` argument instead.
            document.save(path.split(".")[0] + "_PROCESSED.docx")
            reresult = True
    return reresult
def parse_config(soup):
    """
    There are lots of goodies in the config we get back from the ABC.
    In particular, it gives us the URLs of all the other XML data we need.

    Returns a dict of the interesting URLs plus the rtmp host/app parts.
    """
    xml = XML(soup)
    params = dict()
    # .iter() replaces getiterator(), removed in Python 3.9.
    for param in xml.iter('param'):
        params.setdefault(param.get('name'), param.get('value'))

    # should look like "rtmp://cp53909.edgefcs.net/ondemand"
    # Looks like the ABC don't always include this field.
    # If not included, that's okay -- ABC usually gives us the server in
    # the auth result as well.
    rtmp_url = params['server_streaming']
    rtmp_chunks = rtmp_url.split('/')  # ['rtmp:', '', host, app, ...]
    return {
        'rtmp_url': rtmp_url,
        'rtmp_host': rtmp_chunks[2],
        'rtmp_app': rtmp_chunks[3],
        'auth_url': params['auth'],
        'api_url': params['api'],
        'categories_url': params['categories'],
        'captions_url': params['captions'],
    }
def parse_captions(soup):
    """
    Converts custom iView captions into SRT format, usable in most decent
    media players.
    """
    # Escape literal ampersands, which have been seen in some captions XML.
    # BUG FIX: the replacement was b"&" (a no-op); it must be b"&amp;" for
    # the text to become well-formed XML. Inspired by
    # http://stackoverflow.com/questions/6088760/fix-invalid-xml-with-ampersands-in-python
    if b"<![CDATA[" not in soup:  # Not seen, but be future proof
        soup = re.sub(rb"&(?![#\w]+;)", b"&amp;", soup)
    xml = XML(soup)
    output = ''
    i = 1
    # .iter() replaces getiterator(), removed in Python 3.9.
    for title in xml.iter('title'):
        start = title.get('start')
        (start, startfract) = start.rsplit(':', 1)  # "h:mm:ss", "fff"
        end = title.get('end')
        (end, endfract) = end.rsplit(':', 1)
        output = output + '{}\n'.format(i)
        output = output + '{},{:0<3.3} --> {},{:0<3.3}\n'.format(
            start, startfract, end, endfract)
        output = output + title.text.replace('|', '\n') + '\n\n'
        i += 1
    return output
def get_raw_text(pthFile):
    """Return a list containing the paragraphs of the given word (.docx) file."""
    # Constants used to iterate over the XML tree.
    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'
    docWordDoc = zipfile.ZipFile(pthFile)  # a .docx is a zip archive
    xmlContent = docWordDoc.read('word/document.xml')  # access the xml file
    docWordDoc.close()
    # Parse the xml content into a tree used to access the text.
    treeXML = XML(xmlContent)
    lstParagraphs = []  # output list with the paragraphs of the text
    # Iterate over the tree; for each node that contains text, extract it
    # and add it to the output.
    # .iter() replaces getiterator(), which was removed in Python 3.9.
    for parParagraph in treeXML.iter(PARA):
        lstTexts = [
            nodElement.text for nodElement in parParagraph.iter(TEXT)
            if nodElement.text
        ]
        if lstTexts:
            print(lstTexts)  # was a Python 2 print statement (SyntaxError on 3.x)
            lstParagraphs.append(''.join(lstTexts))
    return lstParagraphs
def get_docx_text(path):
    """
    Take the path of a docx file as argument, return the text in unicode,
    paragraphs separated by '----' marker lines.
    """
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)
    # Removed dead debug code that re-parsed a stray 'data.xml' from disk:
    # its writer was commented out (so it crashed when the file was absent)
    # and it used a Python 2 print statement.
    paragraphs = []
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))
    return '----\n\n'.join(paragraphs)
def docx_to_text(document_path, event_handler):
    """Extract text from a docx document.

    First tries python-docx (paragraphs plus tables rendered as
    " | "-separated cells); on any failure, logs the exception and falls
    back to raw w:p/w:t extraction from the document XML.
    """
    global logger
    from docx import Document
    from docx.table import Table
    from docx.text.paragraph import Paragraph
    from docx.oxml.table import CT_Tbl
    from docx.oxml.text.paragraph import CT_P

    try:
        doc = Document(document_path)
        doc_body = doc.element.body
        blocks = []
        for child in doc_body.iterchildren():
            if isinstance(child, CT_P):
                blocks.append(Paragraph(child, doc_body).text)
            elif isinstance(child, CT_Tbl):
                blocks.append('\n'.join(
                    ' | '.join(cell.text for cell in row.cells)
                    for row in Table(child, doc_body).rows))
        #end for
        text = '\n\n'.join(blocks).strip()
        return text
    except Exception:
        logger.exception('Exception while parsing <{}>.'.format(
            event_handler.key))
    #end try

    # Extract it from the XML
    with ZipFile(document_path) as document_zipfile:
        xml_content = document_zipfile.read('word/document.xml')
    try:
        from xml.etree.cElementTree import XML
    except ImportError:
        from xml.etree.ElementTree import XML
    tree = XML(xml_content)
    DOCX_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    DOCX_PARA = DOCX_NAMESPACE + 'p'
    DOCX_TEXT = DOCX_NAMESPACE + 't'
    paragraphs = []
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(DOCX_PARA):
        texts = [node.text for node in paragraph.iter(DOCX_TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))
    #end for
    text = '\n\n'.join(paragraphs)
    return text
def getText(self, path):
    """
    Take the path of a docx file as argument, return the text in unicode.
    Table rows follow a 'TABLE_INFORMATION_ROW_WISE' marker and header1
    text follows an 'INFORMATION_EXTRACTED_FROM_HEADER' marker.
    """
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')
    tree = XML(xml_content)
    paragraphs = []
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))
    paragraphs.append('TABLE_INFORMATION_ROW_WISE')
    for tablerow in tree.iter(TABLE_ROW):
        texts = [node.text for node in tablerow.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(' '.join(texts))
    paragraphs.append('INFORMATION_EXTRACTED_FROM_HEADER')
    try:
        xml_content = document.read('word/header1.xml')
        document.close()
        tree = XML(xml_content)
        for paragraph in tree.iter(PARA):
            texts = [node.text for node in paragraph.iter(TEXT) if node.text]
            if texts:
                paragraphs.append(''.join(texts))
    except Exception:
        # header1.xml may be missing; make sure the archive is closed anyway.
        document.close()
        pass
    return '\n\n'.join(paragraphs)
def get_docx_tables(path):
    """ Yield every w:tbl table element found inside the .docx file. """
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)
    # .iter() replaces getiterator(), removed in Python 3.9.
    for tbl in tree.iter(TBL):
        yield tbl
def get_docx_text(self, path):
    """Return the list of non-empty paragraph texts of the .docx at `path`."""
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')
    tree = XML(xml_content)
    document.close()
    paragraphs = []
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(para_tag):
        texts = [node.text for node in paragraph.iter(text_tag) if node.text]
        if texts:
            paragraphs.append(''.join(texts))
    return paragraphs
def get_docx_text(path):
    """Return the list of non-empty paragraph texts of the .docx at `path`."""
    document = zipfile.ZipFile(path)
    xml_content = document.read("word/document.xml")
    document.close()
    tree = XML(xml_content)
    paragraphs = []
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append("".join(texts))
    return paragraphs
def get_docx_text(path):
    """
    Take the path of a docx file as argument, return the text in unicode
    in the form of a list.
    """
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)
    paragraphs = []
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        # BUG FIX: node.text.encode('utf-8') produced bytes, which breaks
        # ''.join() on Python 3; keep the text as str (unicode) as documented.
        texts = [node.text
                 for node in paragraph.iter(TEXT)
                 if node.text]
        if texts:
            paragraphs.append(''.join(texts))
    return paragraphs
    # (removed a large commented-out section-wise variant — dead code)
def returnFileText(self, xmlHeader='word/document.xml'):
    '''
    Returns the file's text in a large string

    Usage: returnFileText()
    Returns: 'Lorem ipsum ....'

    NOTE(review): when self.fileFormat is none of pdf/doc/docx the final
    `return file` raises NameError -- behaviour kept.
    '''
    if self.fileFormat == 'pdf':
        fp = open(self.filePathName, 'rb')
        rsrcmgr = PDFResourceManager()
        retstr = io.StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
        file = retstr.getvalue()
    elif self.fileFormat == 'doc':
        file = open(self.filePathName, encoding='latin-1').read()
    elif self.fileFormat == 'docx':
        WORD_NAMESPACE = \
            '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
        PARA = WORD_NAMESPACE + 'p'
        TEXT = WORD_NAMESPACE + 't'
        document = zipfile.ZipFile(self.filePathName)
        xml_content = document.read(xmlHeader)
        document.close()
        try:
            tree = XML(xml_content)
            paragraphs = []
            # .iter() replaces getiterator(), removed in Python 3.9.
            for paragraph in tree.iter(PARA):
                texts = [
                    node.text for node in paragraph.iter(TEXT) if node.text
                ]
                if texts:
                    paragraphs.append(''.join(texts))
        except Exception:  # was a bare except
            # NOTE(review): returns the Exception *class*, not an instance --
            # kept for backward compatibility with existing callers.
            return Exception
        file = '''
'''.join(paragraphs)  # newline-separated paragraphs (original literal form)
    return file
def get_docx_text(path):
    """Return the docx text as one string, paragraphs separated by blank lines."""
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)
    paragraphs = []
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [
            node.text for node in paragraph.iter(TEXT) if node.text
        ]
        if texts:
            paragraphs.append(''.join(texts))
    return '\n\n'.join(paragraphs)
def get_doc_text(path):
    """
    Take the path of a docx or an odt file as argument, return the text
    as a list of paragraph strings.  Raises Warning for any other extension.
    """
    if "docx" == path[-4:]:
        document = zipfile.ZipFile(path)
        xml_content = document.read('word/document.xml')
        document.close()
        tree = XML(xml_content)
        paragraphs = []
        # .iter() replaces getiterator(), removed in Python 3.9.
        for paragraph in tree.iter(PARA):
            texts = [node.text for node in paragraph.iter(TEXT) if node.text]
            if texts:
                paragraphs.append(''.join(texts))
        return paragraphs
    elif "odt" == path[-3:]:
        document = zipfile.ZipFile(path)
        xml_content = document.read('content.xml')
        document.close()
        doc = xml.dom.minidom.parseString(xml_content)
        print(" doc: ", doc)
        print("doc::end")
        # We get here all elements: headers, text and table components.
        paras = doc.getElementsByTagName("*")
        print("I have ", len(paras), " paragraphs ")
        paragraphs = []
        for p in paras:
            for ch in p.childNodes:
                if ch.nodeType == ch.TEXT_NODE:
                    paragraphs.append(''.join(ch.wholeText))
        print(paragraphs)
        return paragraphs
    else:
        print()
        raise Warning("only docx and odt files are handled")
def extract_docx_content():
    """
    Extract text content from every .docx file under a fixed network path.
    The .docx file type is in xml format, so this function makes use of the
    xml.etree python module.

    :return: dict mapping filename -> lowercased document text
        (NOTE(review): the original docstring promised a pandas DataFrame
        but nothing was returned at all; the collected dict is returned now.)
    """
    # - pu claims data directory path
    path_input = 'Z:\\xxxx'
    # - Namespace information needed to extract content
    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'
    # - create a dict to store processed docx files
    processed_docx_files = {}
    os.chdir(path_input)
    # - collect all .docx files
    docx_files = glob.glob('*.docx')
    # - files that failed to parse
    remove_list = []
    # - algorithm to extract the .docx data
    for docx in docx_files:
        try:
            document = zipfile.ZipFile(docx)
            xml_content = document.read('word/document.xml')
            document.close()
            tree = XML(xml_content)
            # - load data into paragraphs list
            paragraphs = []
            # .iter() replaces getiterator(), removed in Python 3.9.
            for paragraph in tree.iter(PARA):
                texts = [
                    node.text for node in paragraph.iter(TEXT) if node.text
                ]
                # - convert texts into a single text string
                if texts:
                    paragraphs.append(' '.join(texts).lower())
            # - update the processed_docx_files dict
            processed_docx_files.update({docx: ''.join(paragraphs)})
        except Exception:  # was a bare except
            print("An error occured trying to parse {}".format(docx))
            remove_list.append(docx)
    return processed_docx_files
def read_docx(file, document, path, trie):
    """Extract the text of an already-open docx ZipFile, feed it to
    string_spilt() and record the combined length in the trie index."""
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)
    paragraphs = ""
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = ""
        for node in paragraph.iter(TEXT):
            if node.text:
                texts += node.text.replace('\u7460', "")  # strip stray U+7460
        if texts:
            paragraphs += str(texts)
    string_spilt(paragraphs, path, trie)  # external helper (name kept as-is)
    trie.insert_doc_len(path, len(file) + len(paragraphs))
def parse(self, path_to_document):
    """Load the docx at path_to_document and store its concatenated text
    on self.text."""
    document = zipfile.ZipFile(path_to_document)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)
    paragraphs = []
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [
            node.text for node in paragraph.iter(TEXT) if node.text
        ]
        if texts:
            paragraphs.append(''.join(texts))
    self.text = ''.join(paragraphs)
def get_doc_text(path):
    """
    Take the path of a docx or an odt file as argument, return the text
    as a list of paragraph strings.  Raises Warning for any other extension.
    """
    if "docx" == path[-4:]:
        document = zipfile.ZipFile(path)
        xml_content = document.read('word/document.xml')
        document.close()
        tree = XML(xml_content)
        paragraphs = []
        # .iter() replaces getiterator(), removed in Python 3.9.
        for paragraph in tree.iter(PARA):
            texts = [node.text for node in paragraph.iter(TEXT) if node.text]
            if texts:
                paragraphs.append(''.join(texts))
        return paragraphs
    elif "odt" == path[-3:]:
        document = zipfile.ZipFile(path)
        xml_content = document.read('content.xml')
        document.close()
        doc = xml.dom.minidom.parseString(xml_content)
        print(" doc: ", doc)
        print("doc::end")
        # We get here all elements: headers, text and table components.
        paras = doc.getElementsByTagName("*")
        print("I have ", len(paras), " paragraphs ")
        paragraphs = []
        for p in paras:
            for ch in p.childNodes:
                if ch.nodeType == ch.TEXT_NODE:
                    paragraphs.append(''.join(ch.wholeText))
        print(paragraphs)
        return paragraphs
    else:
        print()
        raise Warning("only docx and odt files are handled")
def docx_do_docx(azip, afile):
    """Extract the text of an already-open docx ZipFile and pass it to
    text_do_data() together with the file name."""
    word_namespace = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    par = word_namespace + "p"
    txt = word_namespace + "t"
    xml_content = azip.read("word/document.xml")
    tree = XML(xml_content)
    paragraphs = []
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(par):
        texts = [node.text for node in paragraph.iter(txt) if node.text]
        if texts:
            paragraphs.append("".join(texts))
    text = "\n\n".join(paragraphs)
    text_do_data(text, afile)  # external sink
def get_docx_text(path):
    """ Take the path of a docx file as argument, return the text in unicode."""
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)
    paragraphs = []
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))
    return '\n\n'.join(paragraphs)
def docx_do_docx(azip, afile):
    """Extract the text of an already-open docx ZipFile and pass it to
    text_do_data() together with the file name."""
    namespace = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    par = namespace + 'p'
    txt = namespace + 't'
    xml_content = azip.read('word/document.xml')
    tree = XML(xml_content)
    paragraphs = []
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(par):
        texts = [node.text for node in paragraph.iter(txt) if node.text]
        if texts:
            paragraphs.append(''.join(texts))
    text = '\n\n'.join(paragraphs)
    text_do_data(text, afile)  # external sink
def get_docx_text(file_path):
    """Take the path of a docx file as argument, return the text in unicode."""
    document = zipfile.ZipFile(file_path)
    xml_content = document.read("word/document.xml")
    document.close()
    tree = XML(xml_content)
    paragraphs = []
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [
            node.text for node in paragraph.iter(TEXT) if node.text
        ]
        if texts:
            paragraphs.append("".join(texts))
    # Type String
    return "\n\n".join(paragraphs)
def __init__(self, filePath):
    """Read the docx at filePath and populate paragraph/sentence/word fields.

    NOTE(review): relies on self.PARA / self.TEXT tag constants and on
    self.paragraphs / self.sentenceList being initialised on the class —
    confirm against the class definition.
    """
    document = zipfile.ZipFile(filePath)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(self.PARA):
        texts = [node.text for node in paragraph.iter(self.TEXT) if node.text]
        if texts:
            self.paragraphs.append(''.join(texts))
            # Split on ., ! or ? followed by whitespace and a capital letter
            # (assumed per-paragraph in the original layout — TODO confirm).
            sentenceEnders = re.compile('[.!?][\s]{1,2}(?=[A-Z])')
            self.sentenceList = self.sentenceList + sentenceEnders.split(''.join(texts))
    self.data = '\n\n'.join(self.paragraphs)
    self.words = re.findall(r"[\w']+", self.data)
    self.filteredData = ' '.join(self.words)
def readWord(filename):
    """Return the document's paragraph text lists as a comma-joined string,
    or None after printing the error when anything fails.

    NOTE(review): each paragraph is appended as str(list) + ',' — the repr
    of the list of runs, kept as-is for compatibility.
    """
    try:
        document = zipfile.ZipFile(filename)
        xml_content = document.read('word/document.xml')
        document.close()
        tree = XML(xml_content)
        paragraphs = ''
        # .iter() replaces getiterator(), removed in Python 3.9.
        for paragraph in tree.iter(PARA):
            texts = [
                node.text for node in paragraph.iter(TEXT) if node.text
            ]
            if texts:
                paragraphs += str(texts) + ','
        return paragraphs
    except Exception as e:
        print('ReadWord exception', e)
def docx_do_docx(azip, afile):
    """Extract the text of an already-open docx ZipFile and pass it to
    text_do_data() together with the file name."""
    namespace = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    par = namespace + 'p'
    txt = namespace + 't'
    xml_content = azip.read('word/document.xml')
    tree = XML(xml_content)
    paragraphs = []
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(par):
        texts = [node.text for node in paragraph.iter(txt) if node.text]
        if texts:
            paragraphs.append(''.join(texts))
    text = '\n\n'.join(paragraphs)
    text_do_data(text, afile)  # external sink
def parse_docx(path, paragraphs):
    """
    Take the path of a docx file and a list 'paragraphs'; append the file's
    paragraph texts to it (in place) and return the whole list joined by
    blank lines.
    """
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [
            node.text for node in paragraph.iter(TEXT) if node.text
        ]
        if texts:
            paragraphs.append(''.join(texts))
    return '\n\n'.join(paragraphs)
def getContents(self):
    """ Read the paragraphs of word/document.xml from self.my_docx and
    return them as a list (also stored on self.text_in_paragraphs). """
    xml_content = self.my_docx.read('word/document.xml')
    self.my_docx.close()
    tree = XML(xml_content)
    self.text_in_paragraphs = []
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            self.text_in_paragraphs.append(''.join(texts))
    return self.text_in_paragraphs
def docxparse(inDocx, outDocx):
    """Extract the text of docx `inDocx` and write it, one paragraph per
    line, to the path `outDocx`."""
    # Renamed the file handle: it used to shadow the outDocx path argument.
    with open(outDocx, "w+") as out_file:
        document = zipfile.ZipFile(inDocx)
        xml_content = document.read('word/document.xml')
        document.close()
        tree = XML(xml_content)
        paragraphs = []
        # .iter() replaces getiterator(), removed in Python 3.9.
        for paragraph in tree.iter(PARA):
            texts = [
                node.text for node in paragraph.iter(TEXT) if node.text
            ]
            if texts:
                paragraphs.append(''.join(texts))
        out_file.write('\n'.join(paragraphs))
def get_docx_text(fileName):
    """Return the text of a .docx file, paragraphs separated by blank lines."""
    zipFile = zipfile.ZipFile(fileName)
    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'
    name = 'word/document.xml'
    document = zipFile.read(name, pwd=None)
    documentTree = XML(document)
    paragraphs = []
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in documentTree.iter(PARA):
        texts = [
            node.text for node in paragraph.iter(TEXT) if node.text
        ]
        if texts:
            paragraphs.append(''.join(texts))
    return ('\n\n'.join(paragraphs))
def get_docx_text(path):
    """
    Take the path of a docx file as argument, return the text in unicode.
    """
    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)
    paragraphs = []
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))
    return '\n\n'.join(paragraphs)
def getContents(self):
    """ Read the paragraphs of word/document.xml from self.my_docx and
    return them as a list (also stored on self.text_in_paragraphs). """
    xml_content = self.my_docx.read('word/document.xml')
    self.my_docx.close()
    tree = XML(xml_content)
    self.text_in_paragraphs = []
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            self.text_in_paragraphs.append(''.join(texts))
    return self.text_in_paragraphs
def get_docx_text(fileName):
    """
    Take the path of a docx file as argument, return the text in unicode.
    """
    document = zipfile.ZipFile(fileName)
    xml_content = document.read('word/document.xml')
    print('xml_content', xml_content)  # debug output, kept to preserve behaviour
    document.close()
    tree = XML(xml_content)
    paragraphs = []
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [
            node.text for node in paragraph.iter(TEXT) if node.text
        ]
        if texts:
            paragraphs.append(''.join(texts))
    return '\n\n'.join(paragraphs)
def parse_captions(soup):
    """
    Converts custom iView captions into SRT format, usable in most decent
    media players.
    """
    xml = XML(soup)
    output = ''
    i = 1
    # .iter() replaces getiterator(), removed in Python 3.9.
    for title in xml.iter('title'):
        start = title.get('start')
        ids = start.rfind(':')  # last ':' separates the fractional seconds
        end = title.get('end')
        ide = end.rfind(':')
        output = output + str(i) + '\n'
        output = output + start[:ids] + ',' + start[ids+1:] + ' --> ' + end[:ide] + ',' + end[ide+1:] + '\n'
        output = output + title.text.replace('|', '\n') + '\n\n'  # '|' marks a line break
        i += 1
    return output
def get_docx_text(path, options):
    """Take the path of a docx file, return its text grouped at bullet/
    numbering boundaries.

    Returns (joined_text, chunks) where chunks are consecutive windows of
    options+1 paragraphs each.
    """
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)
    paragraphs = []
    para = []
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        alltext = [node.text for node in paragraph.iter(TEXT) if node.text]
        # A numbering (bullet) element starts a new logical paragraph.
        for bullet in paragraph.iter(BULLETNUM):
            if len(para) > 0:
                paragraphs.append(''.join(para))
                para = []
        para.append(''.join(alltext))
    paragraphs.append(''.join(para))  # for the last para to be appended
    return '\n\n'.join(paragraphs), [paragraphs[i:i+options+1] for i in range(0, len(paragraphs), options+1)]
def get_docx_text(path):
    """
    Take the path of a docx file as argument, return the text in unicode.
    """
    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)
    paragraphs = []
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))
    return '\n\n'.join(paragraphs)
def get_docx_text(path):
    """
    Take the path of a docx file as argument, return the text in unicode
    as a list of paragraph strings.
    """
    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'  # docx tag namespace
    PARA = WORD_NAMESPACE + 'p'  # paragraph tag
    TEXT = WORD_NAMESPACE + 't'  # text-run tag
    document = zipfile.ZipFile(path)  # a .docx is a zip archive
    xml_content = document.read('word/document.xml')  # the primary xml part
    document.close()
    tree = XML(xml_content)  # parse the xml into a tree
    paragraphs = []  # a list of the paragraphs
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):  # for every paragraph in the tree
        texts = [node.text
                 for node in paragraph.iter(TEXT)
                 if node.text]  # collect this paragraph's text runs
        if texts:  # if any text was found,
            paragraphs.append(''.join(texts))  # add it to the paragraphs list
    return(paragraphs)  # return the paragraphs
def _docx_to_txt(path):
    """
    Take the path of a docx file as argument, return the text in unicode.
    Re-raises BadZipFile (after printing the offending path) for invalid
    archives.
    """
    try:
        document = zipfile.ZipFile(path)
    except BadZipFile as e:
        print(path)
        print(" is not a valid zip file")
        raise e
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)
    paragraphs = []
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))
    return '\n\n'.join(paragraphs)
def prepare_docx(self, source):
    """Capture the docx, extract its texts, fix split words and divide it
    into paragraphs/lines to feed pre_process; returns the output .txt path.
    (Docstring translated from Spanish.)"""
    document = zipfile.ZipFile(source)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)
    output = '' + str(source[:-5]) + '_txt.txt'  # strip ".docx", add suffix
    t = open(output, 'wb')
    # .iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if (texts != [] and texts and texts != " " and texts != ""):
            t.write(self.pre_process(''.join(texts)).encode('utf-8'))
            t.write(b'\n')
        else:
            # Empty paragraphs still produce a blank line.
            t.write(b'\n')
    t.close()
    return output
def get_docx_comments(path, anonymous):
    """
    Take the path of a docx file as argument, return the comments text in
    unicode, each preceded by an "N. Author: ... Date:..." header line.

    NOTE(review): the `anonymous` flag is accepted but currently unused.
    """
    document = zipfile.ZipFile(path)
    xml_content = document.read(r'word/comments.xml')
    document.close()
    tree = XML(xml_content)
    paragraphs = []
    # .iter() replaces getiterator(), removed in Python 3.9.
    for comment in tree.iter(COMMENT):
        id = int(comment.attrib[ID]) + 1  # Bump ID number cuz people don't like zero.
        dt = comment.attrib[DATE]
        author = comment.attrib[AUTHOR]
        for paragraph in comment.iter(PARA):
            texts = [node.text for node in paragraph.iter(TEXT) if node.text]
            if texts:
                paragraphs.append('{0}. Author: {1} Date:{2}'.format(id, author, dt))
                paragraphs.append(''.join(texts))
    return '\n\n'.join(paragraphs)