def _run_text_filter(argv, data=None):
    """Run an external converter and return the bytes it wrote to stdout.

    ``argv`` is handed to ``subprocess.Popen`` as an argument list, so no
    shell is involved and file names need no quoting.  When ``data`` is
    given it is fed to the program's stdin.

    Returns an empty byte string if the program cannot be started, which
    keeps the best-effort behaviour of the shell pipeline this replaces.
    """
    import subprocess

    stdin = subprocess.PIPE if data is not None else None
    try:
        proc = subprocess.Popen(argv, stdin=stdin, stdout=subprocess.PIPE)
    except OSError:
        return b''
    # communicate() writes and reads concurrently, so a large document
    # cannot deadlock on a full pipe.
    output, _ = proc.communicate(data)
    return output


def content_index(content, filename=None, content_type=None):
    """Extract indexable text from ``content`` based on ``filename``'s extension.

    Supported extensions:

    * ``.doc``  -- piped through ``antiword``; its output is decoded as
      latin1 and re-encoded as UTF-8.
    * ``.pdf``  -- written to a temporary file and converted with
      ``pdftotext -enc UTF-8 -nopgbrk``.
    * ``.odt``  -- parsed with ``odt2txt`` and encoded as ASCII, with
      unencodable characters replaced.
    * ``.txt``, ``.py``, ``.patch``, ``.html``, ``.csv`` -- returned as is.

    Args:
        content: Raw document data.
        filename: Name used only to determine the extension.  May be
            ``None`` or empty, in which case nothing is extracted.
        content_type: Accepted for interface compatibility; not consulted.

    Returns:
        The extracted text, or ``''`` when the extension is missing or not
        supported.
    """
    import tempfile

    # filename defaults to None, which os.path.splitext cannot handle.
    ext = os.path.splitext(filename)[1] if filename else ''

    if ext == '.doc':
        raw = _run_text_filter(['antiword', '-'], content)
        return raw.decode('latin1', 'replace').encode('utf-8', 'replace')

    if ext == '.pdf':
        # mkstemp creates the file atomically; it is removed once
        # pdftotext has finished with it.
        fd, path = tempfile.mkstemp(suffix='.pdf')
        try:
            with os.fdopen(fd, 'wb') as tmp:
                tmp.write(content)
            return _run_text_filter(
                ['pdftotext', '-enc', 'UTF-8', '-nopgbrk', path, '-'])
        finally:
            os.remove(path)

    if ext == '.odt':
        s = StringIO.StringIO(content)
        o = odt2txt.OpenDocumentTextFile(s)
        return o.toString().encode('ascii', 'replace')

    if ext in ('.txt', '.py', '.patch', '.html', '.csv'):
        return content

    return ''
def _doIndexContent(self, content):
    """Return the text of the OpenDocument data in ``content``.

    ``content`` is wrapped in an in-memory ``StringIO`` buffer, parsed with
    ``odt2txt.OpenDocumentTextFile``, and the resulting string is passed
    through ``_to_unicode`` before being returned.
    """
    buffer = StringIO.StringIO(content)
    document = odt2txt.OpenDocumentTextFile(buffer)
    text = _to_unicode(document.toString())
    # Release the in-memory buffer once the text has been extracted.
    buffer.close()
    return text
import os
import re

import odt2txt

# Pattern searched for in every document; it is interpreted as a regular
# expression, not as a literal string.
to_find = "nul"

# Maps each .odt path to the list of start offsets where to_find matched.
occurences = {}

# Compile once: the pattern is the same for every file.
_pattern = re.compile(to_find)

# Walk the current directory tree and scan every .odt file found.
for folder, subfolders, files in os.walk("./"):
    for name in files:  # not "file": that would shadow the builtin
        if not name.endswith(".odt"):
            continue
        print(name)
        fn = os.path.join(folder, name)
        odt = odt2txt.OpenDocumentTextFile(fn)
        # Not named "unicode": that would shadow the builtin module-wide.
        text = odt.toString()
        occurences[fn] = [m.start() for m in _pattern.finditer(text)]
def ParseODT(doc):
    """Return the text of the OpenDocument file ``doc`` encoded as UTF-8.

    ``doc`` is passed unchanged to ``odt2txt.OpenDocumentTextFile``; the
    string produced by its ``toString()`` is then encoded with UTF-8.
    """
    return odt2txt.OpenDocumentTextFile(doc).toString().encode("utf-8")