Exemplo n.º 1
0
 def ODT(self, url):
     """Download an ODT document and return the text extracted from it.

     The file at ``url`` is saved as ``TEMP_FOLDER + "temp.odt"``, unpacked
     into ``TEMP_FOLDER`` with the external ``unzip`` tool, and the
     resulting ``content.xml`` is parsed and handed to
     ``self.getTextFromXML``.

     Args:
         url: Location of the ODT file, passed unchanged to
             ``downloadFile``.

     Returns:
         Whatever ``self.getTextFromXML`` returns for the root element of
         ``content.xml``.
     """
     import subprocess

     odt_path = TEMP_FOLDER + "temp.odt"
     content_path = TEMP_FOLDER + "content.xml"

     # The temp folder is reused between calls. Remove content.xml left by
     # an earlier document so a failed extraction cannot silently return
     # the previous document's text.
     try:
         os.remove(content_path)
     except OSError:
         pass  # no stale file to clean up

     downloadFile(url, odt_path)

     # -o overwrites files from earlier extractions without prompting; the
     # argument list avoids shell parsing of the paths. As with the former
     # os.system call, the exit status is not checked: a failed extraction
     # surfaces when content.xml cannot be parsed below.
     subprocess.call(['unzip', '-q', '-o', '-d', TEMP_FOLDER, odt_path])

     root = etree.parse(content_path).getroot()
     return self.getTextFromXML(root)
Exemplo n.º 2
0
	def ODT(self, url):
		"""Download an ODT document and return the text extracted from it.

		The file at ``url`` is saved as ``TEMP_FOLDER + "temp.odt"``, unpacked
		into ``TEMP_FOLDER`` with the external ``unzip`` tool, and the
		resulting ``content.xml`` is parsed and handed to
		``self.getTextFromXML``.

		Args:
			url: Location of the ODT file, passed unchanged to
				``downloadFile``.

		Returns:
			Whatever ``self.getTextFromXML`` returns for the root element of
			``content.xml``.
		"""
		import subprocess

		odt_path = TEMP_FOLDER + "temp.odt"
		content_path = TEMP_FOLDER + "content.xml"

		# The temp folder is reused between calls. Remove content.xml left by
		# an earlier document so a failed extraction cannot silently return
		# the previous document's text.
		try:
			os.remove(content_path)
		except OSError:
			pass  # no stale file to clean up

		downloadFile(url, odt_path)

		# -o overwrites files from earlier extractions without prompting; the
		# argument list avoids shell parsing of the paths. As with the former
		# os.system call, the exit status is not checked: a failed extraction
		# surfaces when content.xml cannot be parsed below.
		subprocess.call(['unzip', '-q', '-o', '-d', TEMP_FOLDER, odt_path])

		root = etree.parse(content_path).getroot()
		return self.getTextFromXML(root)
Exemplo n.º 3
0
 def PDF(self, url, enc='UTF-8'):
     """Download a PDF, convert it to text and return the normalized text.

     The file at ``url`` is saved as ``TEMP_FOLDER + "temp.pdf"`` and
     converted to ``TEMP_FOLDER + "temp.txt"`` by the command configured in
     ``PDFTOTEXT``. The text file is then read with ``readfile`` and passed
     through ``normalizePDF``.

     Args:
         url: Location of the PDF file, passed unchanged to
             ``downloadFile``.
         enc: Value given to the converter's ``-enc`` option.

     Returns:
         The result of ``normalizePDF`` applied to the converted text.
     """
     import shlex
     import subprocess

     pdf_path = TEMP_FOLDER + "temp.pdf"
     txt_path = TEMP_FOLDER + "temp.txt"

     # The temp file names are reused between calls. Remove the output of
     # an earlier conversion so a failed run cannot silently return the
     # previous document's text.
     try:
         os.remove(txt_path)
     except OSError:
         pass  # no stale file to clean up

     downloadFile(url, pdf_path)

     # PDFTOTEXT is a command prefix (program plus any fixed options), so
     # it is split into words; enc and the paths are passed as separate
     # arguments and never interpreted by a shell. As with the former
     # os.system call, the exit status is not checked.
     command = shlex.split(PDFTOTEXT) + ["-enc", enc, pdf_path, txt_path]
     subprocess.call(command)

     return normalizePDF(readfile(txt_path))
Exemplo n.º 4
0
	def PDF(self, url, enc='UTF-8'):
		"""Download a PDF, convert it to text and return the normalized text.

		The file at ``url`` is saved as ``TEMP_FOLDER + "temp.pdf"`` and
		converted to ``TEMP_FOLDER + "temp.txt"`` by the command configured in
		``PDFTOTEXT``. The text file is then read with ``readfile`` and passed
		through ``normalizePDF``.

		Args:
			url: Location of the PDF file, passed unchanged to
				``downloadFile``.
			enc: Value given to the converter's ``-enc`` option.

		Returns:
			The result of ``normalizePDF`` applied to the converted text.
		"""
		import shlex
		import subprocess

		pdf_path = TEMP_FOLDER + "temp.pdf"
		txt_path = TEMP_FOLDER + "temp.txt"

		# The temp file names are reused between calls. Remove the output of
		# an earlier conversion so a failed run cannot silently return the
		# previous document's text.
		try:
			os.remove(txt_path)
		except OSError:
			pass  # no stale file to clean up

		downloadFile(url, pdf_path)

		# PDFTOTEXT is a command prefix (program plus any fixed options), so
		# it is split into words; enc and the paths are passed as separate
		# arguments and never interpreted by a shell. As with the former
		# os.system call, the exit status is not checked.
		command = shlex.split(PDFTOTEXT) + ["-enc", enc, pdf_path, txt_path]
		subprocess.call(command)

		return normalizePDF(readfile(txt_path))