def get_text(fileDir):
    """Return the text content of ``content.xml`` stored in a zip archive.

    The archive at *fileDir* is opened, its ``content.xml`` member is read
    and handed to ``BeautifulStoneSoup``; the result of the soup's
    ``get_text()`` is returned.

    Args:
        fileDir: Path (or file-like object) accepted by ``zipfile.ZipFile``.
            Presumably an OpenDocument file, given the ``content.xml``
            member name -- confirm against callers.

    Returns:
        Whatever ``BeautifulStoneSoup(...).get_text()`` yields for the
        member's markup.

    Raises:
        zipfile.BadZipFile: If *fileDir* is not a valid zip archive.
        KeyError: If the archive has no ``content.xml`` member.
    """
    # The context manager guarantees the archive is closed even when the
    # member is missing or reading fails.
    with zipfile.ZipFile(fileDir) as document:
        content = document.read('content.xml')

    # The bytes are already in memory, so the archive need not stay open
    # while the markup is parsed.
    textSoup = BeautifulStoneSoup(content)
    return textSoup.get_text()
def get_text(fileDir):
    """Extract the text of ``content.xml`` from the zip archive at *fileDir*.

    Args:
        fileDir: Path (or file-like object) accepted by ``zipfile.ZipFile``.

    Returns:
        The result of ``get_text()`` on a ``BeautifulStoneSoup`` built from
        the raw bytes of the archive's ``content.xml`` member.

    Raises:
        zipfile.BadZipFile: If *fileDir* is not a valid zip archive.
        KeyError: If the archive has no ``content.xml`` member.
    """
    # ``with`` closes the archive on every exit path, including when
    # ``read`` raises because the member is absent.
    with zipfile.ZipFile(fileDir) as document:
        raw_xml = document.read('content.xml')

    return BeautifulStoneSoup(raw_xml).get_text()