Exemplo n.º 1
0
def content_index(content, filename=None, content_type=None):
    """Extract indexable plain text from a document's raw content.

    The converter is selected from the extension of *filename* (the
    comparison is case-sensitive):

    * ``.doc`` -- piped through the external ``antiword`` program; its
      output is decoded as latin-1 and re-encoded as UTF-8.
    * ``.pdf`` -- written to a temporary file and converted by the external
      ``pdftotext`` program (UTF-8 output, page breaks suppressed).
    * ``.odt`` -- parsed with ``odt2txt.OpenDocumentTextFile`` and encoded
      as ASCII, replacing characters that cannot be encoded.
    * ``.txt``, ``.py``, ``.patch``, ``.html``, ``.csv`` -- returned
      unchanged.

    Args:
        content: Raw content of the file.
        filename: Name of the file; only its extension is inspected.
        content_type: Accepted for interface compatibility; not used.

    Returns:
        The extracted text, or ``''`` when *filename* is missing, the
        extension is not handled, or the external converter cannot be
        started.
    """
    # Imported locally so the block does not depend on module-level imports
    # beyond the ones it already relied on (os, StringIO, odt2txt).
    import subprocess
    import tempfile

    result = ''
    # Without a filename there is no extension to dispatch on; the original
    # default of None would have crashed inside os.path.splitext().
    if not filename:
        return result

    ext = os.path.splitext(filename)[1]
    if ext == '.doc':  # or content_type ?
        try:
            proc = subprocess.Popen(['antiword', '-'],
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE)
        except OSError:
            # Converter not installed: keep the best-effort behaviour of
            # yielding no text instead of failing the caller.
            return result
        raw = proc.communicate(content)[0]
        result = raw.decode('latin1', 'replace').encode('utf-8', 'replace')
    elif ext == '.pdf':
        # mkstemp creates the file atomically with a safe name, so the
        # caller-supplied filename never reaches the command line.
        fd, tmp_path = tempfile.mkstemp(suffix='.pdf')
        try:
            with os.fdopen(fd, 'wb') as fp:
                fp.write(content)
            try:
                # Argument list (no shell) rules out command injection.
                proc = subprocess.Popen(
                    ['pdftotext', '-enc', 'UTF-8', '-nopgbrk', tmp_path, '-'],
                    stdout=subprocess.PIPE)
            except OSError:
                return result
            result = proc.communicate()[0]
        finally:
            # Always remove the temporary copy, even if conversion failed.
            os.remove(tmp_path)
    elif ext == '.odt':
        s = StringIO.StringIO(content)
        o = odt2txt.OpenDocumentTextFile(s)
        result = o.toString().encode('ascii', 'replace')
    elif ext in ('.txt', '.py', '.patch', '.html', '.csv'):
        result = content
    return result
Exemplo n.º 2
0
 def _doIndexContent(self, content):
     """Return the text of an OpenDocument file given as raw content.

     The content is wrapped in an in-memory ``StringIO`` buffer, parsed
     with ``odt2txt.OpenDocumentTextFile``, and the resulting string is
     passed through ``_to_unicode`` before being returned.
     """
     buffer = StringIO.StringIO(content)
     text = _to_unicode(odt2txt.OpenDocumentTextFile(buffer).toString())
     buffer.close()
     return text
Exemplo n.º 3
0
import os
import odt2txt
import re

# Walk the current directory tree, print the name of every .odt file found,
# and record the start offset of each match of `to_find` in its text,
# keyed by the file's path.
to_find = "nul"
occurences = {}
for dirpath, _dirnames, filenames in os.walk("./"):
    for name in filenames:
        if not name.endswith(".odt"):
            continue
        print(name)
        path = os.path.join(dirpath, name)
        text = odt2txt.OpenDocumentTextFile(path).toString()
        # `to_find` is used as a regular expression pattern.
        occurences[path] = [match.start()
                            for match in re.finditer(to_find, text)]


Exemplo n.º 4
0
def ParseODT(doc):
    """Return the text of an OpenDocument file encoded as UTF-8.

    *doc* is handed directly to ``odt2txt.OpenDocumentTextFile``; the string
    produced by its ``toString()`` method is encoded with UTF-8.
    """
    return odt2txt.OpenDocumentTextFile(doc).toString().encode("utf-8")