def ODT(self, url):
    """Download an ODT document and return the text extracted from it.

    The file at ``url`` is saved as ``temp.odt`` inside ``TEMP_FOLDER``.
    Its ``content.xml`` member is unpacked into ``TEMP_FOLDER``, parsed, and
    the root element is handed to ``self.getTextFromXML``.

    Args:
        url: Location of the document; passed unchanged to ``downloadFile``.

    Returns:
        Whatever ``self.getTextFromXML`` returns for the document root.

    Raises:
        zipfile.BadZipFile: The downloaded file is not a valid zip archive
            (spelled ``BadZipfile`` on Python 2).
        KeyError: The archive has no ``content.xml`` member.
    """
    # Imported locally so the block does not depend on a file-level import.
    import zipfile

    odtdest = TEMP_FOLDER + "temp." + "odt"
    downloadFile(url, odtdest)

    # Use zipfile instead of shelling out to `unzip`: the external tool
    # prompts interactively when content.xml is left over from an earlier
    # call, and a failed extraction went unnoticed so the previous
    # document's content.xml was parsed instead. extract() overwrites
    # silently and raises on a broken archive. Only content.xml is unpacked
    # because it is the only member this method reads.
    with zipfile.ZipFile(odtdest) as archive:
        content_path = archive.extract("content.xml", TEMP_FOLDER)

    root = etree.parse(content_path).getroot()
    return self.getTextFromXML(root)
def PDF(self, url, enc='UTF-8'):
    """Download a PDF, convert it to text and return the normalized text.

    The file at ``url`` is saved as ``temp.pdf`` inside ``TEMP_FOLDER``, the
    converter named by ``PDFTOTEXT`` writes ``temp.txt`` next to it, and the
    content of that file is passed through ``normalizePDF``.

    Args:
        url: Location of the document; passed unchanged to ``downloadFile``.
        enc: Value given to the converter's ``-enc`` option.

    Returns:
        The result of ``normalizePDF`` applied to the converted text.

    Raises:
        OSError: The converter executable could not be started.
    """
    # NOTE(review): another PDF definition with the same body follows this
    # one in the file; if both live in the same scope the later one wins and
    # this one is dead code -- confirm and drop one of them.

    # Imported locally so the block does not depend on file-level imports.
    import shlex
    import subprocess

    pdfdest = TEMP_FOLDER + "temp." + "pdf"
    txtdest = TEMP_FOLDER + "temp." + "txt"
    downloadFile(url, pdfdest)

    # Drop the output of any earlier call so a failed conversion cannot
    # silently return the previous document's text.
    try:
        os.remove(txtdest)
    except OSError:
        # Nothing to clean up (for example on the very first call).
        pass

    # Pass an argument list instead of a shell string so that `enc` and the
    # file paths are never interpreted by a shell.
    # Assumes PDFTOTEXT holds the executable plus optional flags separated
    # by whitespace -- TODO confirm against its definition.
    command = shlex.split(PDFTOTEXT) + ["-enc", enc, pdfdest, txtdest]
    subprocess.call(command)

    return normalizePDF(readfile(txtdest))
def PDF(self, url, enc='UTF-8'):
    """Download a PDF, convert it to text and return the normalized text.

    The file at ``url`` is saved as ``temp.pdf`` inside ``TEMP_FOLDER``, the
    converter named by ``PDFTOTEXT`` writes ``temp.txt`` next to it, and the
    content of that file is passed through ``normalizePDF``.

    Args:
        url: Location of the document; passed unchanged to ``downloadFile``.
        enc: Value given to the converter's ``-enc`` option.

    Returns:
        The result of ``normalizePDF`` applied to the converted text.

    Raises:
        OSError: The converter executable could not be started.
    """
    # NOTE(review): an earlier PDF definition with the same body precedes
    # this one in the file; if both live in the same scope this later one is
    # the effective definition -- confirm and drop the duplicate.

    # Imported locally so the block does not depend on file-level imports.
    import shlex
    import subprocess

    pdfdest = TEMP_FOLDER + "temp." + "pdf"
    txtdest = TEMP_FOLDER + "temp." + "txt"
    downloadFile(url, pdfdest)

    # Drop the output of any earlier call so a failed conversion cannot
    # silently return the previous document's text.
    try:
        os.remove(txtdest)
    except OSError:
        # Nothing to clean up (for example on the very first call).
        pass

    # Pass an argument list instead of a shell string so that `enc` and the
    # file paths are never interpreted by a shell.
    # Assumes PDFTOTEXT holds the executable plus optional flags separated
    # by whitespace -- TODO confirm against its definition.
    command = shlex.split(PDFTOTEXT) + ["-enc", enc, pdfdest, txtdest]
    subprocess.call(command)

    return normalizePDF(readfile(txtdest))