def html_text(self, fn):
    """Convert the zipped document at *fn* to HTML and return it as bytes.

    ``meta.xml`` is transformed with ``stylesheet_meta`` into the ``<head>``
    section and ``content.xml`` with ``stylesheet_content`` into the
    ``<body>``.  Metadata extraction is best effort; errors while opening
    the archive or while reading/transforming ``content.xml`` propagate to
    the caller.
    """
    # NOTE(review): the two literals join with no whitespace between the
    # http-equiv and content attributes -- confirm consumers tolerate it.
    docdata = b'<html>\n<head>\n<meta http-equiv="Content-Type"' \
              b'content="text/html; charset=UTF-8">'

    # Context managers make sure both the file and the archive are closed,
    # including when the content transformation raises.
    with open(fn, 'rb') as f, ZipFile(f) as zf:
        # Wrap metadata extraction because it can sometimes throw
        # while the main text will be valid
        try:
            metadata = zf.read("meta.xml")
            if metadata:
                docdata += rclxslt.apply_sheet_data(stylesheet_meta,
                                                    metadata)
        except Exception:
            # To be checked. I'm under the impression that I get this when
            # nothing matches?
            pass

        docdata += b'</head>\n<body>\n'

        content = zf.read("content.xml")
        if content:
            docdata += rclxslt.apply_sheet_data(stylesheet_content, content)

    docdata += b'</body></html>'
    return docdata
def extractone(self, params):
    """Extract the document named by ``params["filename:"]`` as HTML.

    Returns a 4-tuple ``(ok, docdata, "", eof)``.  On failure (missing
    file name, archive cannot be opened) ``ok`` is False, ``docdata`` is
    ``""`` and ``eof`` is ``rclexecm.RclExecM.eofnow``; on success
    ``docdata`` holds the HTML bytes and ``eof`` is
    ``rclexecm.RclExecM.eofnext``.

    Each archive part (core properties, word, xl, ppt slides) is processed
    on a best-effort basis: a missing or unparsable part is skipped.
    """
    if "filename:" not in params:
        self.em.rclog("extractone: no mime or file name")
        return (False, "", "", rclexecm.RclExecM.eofnow)
    fn = params["filename:"]

    f = None
    try:
        f = open(fn, 'rb')
        zf = ZipFile(f)
    except Exception as err:
        # Do not leak the descriptor when the file opens but is not a
        # valid archive.
        if f is not None:
            f.close()
        self.em.rclog("unzip failed: " + str(err))
        return (False, "", "", rclexecm.RclExecM.eofnow)

    def slide_order(name):
        # Numeric order on the slide number, so that slide2 comes before
        # slide11. Names without a plain number sort after the numbered
        # ones, in lexical order.
        stem = name[len('ppt/slides/slide'):-len('.xml')]
        try:
            return (0, int(stem), name)
        except ValueError:
            return (1, 0, name)

    # Both the archive and the underlying file are closed on exit.
    with f, zf:
        docdata = b'<html><head>'
        try:
            metadata = zf.read("docProps/core.xml")
            if metadata:
                docdata += rclxslt.apply_sheet_data(meta_stylesheet,
                                                    metadata)
        except Exception as err:
            # To be checked. I'm under the impression that I get this when
            # nothing matches?
            self.em.rclog("no/bad metadata in %s: %s" % (fn, err))
        docdata += b'</head><body>'

        try:
            content = zf.read('word/document.xml')
            stl = self.computestylesheet('word')
            docdata += rclxslt.apply_sheet_data(stl, content)
        except Exception:
            pass

        try:
            content = zf.read('xl/sharedStrings.xml')
            stl = self.computestylesheet('xl')
            docdata += rclxslt.apply_sheet_data(stl, content)
        except Exception:
            pass

        try:
            stl = self.computestylesheet('pp')
            slides = [name for name in zf.namelist()
                      if fnmatch.fnmatch(name, 'ppt/slides/slide*.xml')]
            for name in sorted(slides, key=slide_order):
                content = zf.read(name)
                docdata += rclxslt.apply_sheet_data(stl, content)
        except Exception:
            pass

        docdata += b'</body></html>'

    return (True, docdata, "", rclexecm.RclExecM.eofnext)
def html_text(self, fn):
    """Convert the zipped document at *fn* to HTML and return it as bytes.

    ``docProps/core.xml`` feeds the ``<head>``; ``word/document.xml``,
    ``xl/sharedStrings.xml`` and the ``ppt/slides/slide*.xml`` members feed
    the ``<body>``, each through the stylesheet returned by
    ``self.computestylesheet``.  Every part is best effort: a missing or
    unparsable part is skipped.  Errors opening the archive propagate.
    """
    def slide_order(name):
        # Numeric order on the slide number, so that slide2 comes before
        # slide11. Names without a plain number sort after the numbered
        # ones, in lexical order.
        stem = name[len('ppt/slides/slide'):-len('.xml')]
        try:
            return (0, int(stem), name)
        except ValueError:
            return (1, 0, name)

    docdata = b'<html><head>'

    # Context managers make sure the file and the archive get closed.
    with open(fn, 'rb') as f, ZipFile(f) as zf:
        try:
            metadata = zf.read("docProps/core.xml")
            if metadata:
                docdata += rclxslt.apply_sheet_data(meta_stylesheet,
                                                    metadata)
        except Exception:
            pass
        docdata += b'</head><body>'

        try:
            content = zf.read('word/document.xml')
            stl = self.computestylesheet('word')
            docdata += rclxslt.apply_sheet_data(stl, content)
        except Exception:
            pass

        try:
            content = zf.read('xl/sharedStrings.xml')
            stl = self.computestylesheet('xl')
            docdata += rclxslt.apply_sheet_data(stl, content)
        except Exception:
            pass

        try:
            stl = self.computestylesheet('pp')
            slides = [name for name in zf.namelist()
                      if fnmatch.fnmatch(name, 'ppt/slides/slide*.xml')]
            # A separate loop name keeps the ``fn`` parameter intact.
            for name in sorted(slides, key=slide_order):
                content = zf.read(name)
                docdata += rclxslt.apply_sheet_data(stl, content)
        except Exception:
            pass

    docdata += b'</body></html>'
    return docdata
def extractone(self, params):
    """Extract the document named by ``params["filename:"]`` as HTML.

    Returns a 4-tuple ``(ok, docdata, "", eof)``.  On failure (missing
    file name, archive cannot be opened, content cannot be read or
    transformed) ``ok`` is False, ``docdata`` is ``""`` and ``eof`` is
    ``rclexecm.RclExecM.eofnow``; on success ``docdata`` holds the HTML
    bytes and ``eof`` is ``rclexecm.RclExecM.eofnext``.

    Metadata (``meta.xml``) is best effort; ``content.xml`` is mandatory.
    """
    if "filename:" not in params:
        self.em.rclog("extractone: no mime or file name")
        return (False, "", "", rclexecm.RclExecM.eofnow)
    fn = params["filename:"]

    try:
        zf = ZipFile(fn.decode('UTF-8'))
    except Exception as err:
        self.em.rclog("unzip failed: %s" % err)
        return (False, "", "", rclexecm.RclExecM.eofnow)

    # The archive is closed on every exit path, including the error return.
    with zf:
        # NOTE(review): the two literals join with no whitespace between
        # the http-equiv and content attributes -- confirm consumers
        # tolerate it.
        docdata = b'<html>\n<head>\n<meta http-equiv="Content-Type"' \
                  b'content="text/html; charset=UTF-8">'

        try:
            metadata = zf.read("meta.xml")
            if metadata:
                docdata += rclxslt.apply_sheet_data(stylesheet_meta,
                                                    metadata)
        except Exception:
            # To be checked. I'm under the impression that I get this when
            # nothing matches?
            pass

        docdata += b'</head>\n<body>\n'

        try:
            content = zf.read("content.xml")
            if content:
                docdata += rclxslt.apply_sheet_data(stylesheet_content,
                                                    content)
            docdata += b'</body></html>'
        except Exception as err:
            self.em.rclog("bad data in %s: %s" % (fn, err))
            return (False, "", "", rclexecm.RclExecM.eofnow)

    return (True, docdata, "", rclexecm.RclExecM.eofnext)
def extractone(self, params):
    """Extract the document named by ``params["filename:"]`` as HTML.

    The whole file is read and transformed with ``stylesheet_all``.

    Returns a 4-tuple ``(ok, docdata, "", eof)``.  On failure (missing
    file name, unreadable file, transformation error) ``ok`` is False,
    ``docdata`` is ``""`` and ``eof`` is ``rclexecm.RclExecM.eofnow``; on
    success ``docdata`` is the transformation result and ``eof`` is
    ``rclexecm.RclExecM.eofnext``.
    """
    if "filename:" not in params:
        self.em.rclog("extractone: no mime or file name")
        return (False, "", "", rclexecm.RclExecM.eofnow)
    fn = params["filename:"]

    try:
        # Close the file as soon as its contents are in memory.
        with open(fn, 'rb') as f:
            data = f.read()
        docdata = rclxslt.apply_sheet_data(stylesheet_all, data)
    except Exception as err:
        # The format needs one placeholder per argument: with a single
        # "%s" the formatting itself raised TypeError inside this handler.
        self.em.rclog("%s: bad data: %s" % (fn, err))
        return (False, "", "", rclexecm.RclExecM.eofnow)

    return (True, docdata, "", rclexecm.RclExecM.eofnext)
def extractone(self, params):
    """Extract the document named by ``params["filename:"]`` as HTML.

    The file contents are transformed twice: with ``stylesheet_meta`` for
    the ``<head>`` (best effort) and with ``stylesheet_content`` for the
    ``<body>`` (mandatory).

    Returns a 4-tuple ``(ok, docdata, "", eof)``.  On failure (missing
    file name, unreadable file, content transformation error) ``ok`` is
    False, ``docdata`` is ``""`` and ``eof`` is
    ``rclexecm.RclExecM.eofnow``; on success ``docdata`` holds the HTML
    bytes and ``eof`` is ``rclexecm.RclExecM.eofnext``.
    """
    if "filename:" not in params:
        self.em.rclog("extractone: no mime or file name")
        return (False, "", "", rclexecm.RclExecM.eofnow)
    fn = params["filename:"]

    try:
        # The context manager closes the file even when read() raises.
        with open(fn, 'rb') as f:
            data = f.read()
    except Exception as err:
        self.em.rclog("open failed: %s" % err)
        return (False, "", "", rclexecm.RclExecM.eofnow)

    # NOTE(review): the two literals join with no whitespace between the
    # http-equiv and content attributes -- confirm consumers tolerate it.
    docdata = b'<html>\n<head>\n<meta http-equiv="Content-Type"' \
              b'content="text/html; charset=UTF-8">\n'

    try:
        docdata += rclxslt.apply_sheet_data(stylesheet_meta, data)
    except Exception:
        # To be checked. I'm under the impression that I get this when
        # nothing matches?
        pass

    docdata += b'</head><body>'

    try:
        docdata += rclxslt.apply_sheet_data(stylesheet_content, data)
        docdata += b'</body></html>'
    except Exception as err:
        self.em.rclog("bad data in %s: %s" % (fn, err))
        return (False, "", "", rclexecm.RclExecM.eofnow)

    return (True, docdata, "", rclexecm.RclExecM.eofnext)
def extractone(self, params):
    """Extract the document named by ``params["filename:"]`` as HTML.

    Returns a 4-tuple ``(ok, docdata, "", eof)``.  On failure (missing
    file name, archive cannot be opened, content cannot be read or
    transformed) ``ok`` is False, ``docdata`` is ``""`` and ``eof`` is
    ``rclexecm.RclExecM.eofnow``; on success ``docdata`` holds the HTML
    bytes and ``eof`` is ``rclexecm.RclExecM.eofnext``.

    Metadata (``meta.xml``) is best effort; ``content.xml`` is mandatory.
    """
    if "filename:" not in params:
        self.em.rclog("extractone: no mime or file name")
        return (False, "", "", rclexecm.RclExecM.eofnow)
    fn = params["filename:"]

    try:
        zf = ZipFile(fn.decode('UTF-8'))
    except Exception as err:
        self.em.rclog("unzip failed: %s" % err)
        return (False, "", "", rclexecm.RclExecM.eofnow)

    # The archive is closed on every exit path, including the error return.
    with zf:
        # NOTE(review): the two literals join with no whitespace between
        # the http-equiv and content attributes -- confirm consumers
        # tolerate it.
        docdata = b'<html><head><meta http-equiv="Content-Type"' \
                  b'content="text/html; charset=UTF-8"></head><body>'

        # NOTE(review): <head> is already closed at this point, so the
        # output of the metadata transformation lands inside <body> --
        # confirm this is intended.
        try:
            metadata = zf.read("meta.xml")
            if metadata:
                docdata += rclxslt.apply_sheet_data(stylesheet_meta,
                                                    metadata)
        except Exception:
            # To be checked. I'm under the impression that I get this when
            # nothing matches?
            pass

        try:
            content = zf.read("content.xml")
            if content:
                docdata += rclxslt.apply_sheet_data(stylesheet_content,
                                                    content)
            docdata += b'</body></html>'
        except Exception as err:
            self.em.rclog("bad data in %s: %s" % (fn, err))
            return (False, "", "", rclexecm.RclExecM.eofnow)

    return (True, docdata, "", rclexecm.RclExecM.eofnext)
def html_text(self, fn):
    """Convert the zipped document at *fn* to HTML and return it as bytes.

    ``docProps/core.xml`` feeds the ``<head>``.  ``word/document.xml``,
    ``xl/sharedStrings.xml``, the numbered ``ppt/slides/slideN.xml`` members
    and the numbered ``visio/pages/pageN.xml`` members feed the ``<body>``,
    each through the stylesheet returned by ``self.computestylesheet``.
    Every part is best effort: a missing or unparsable part is skipped.
    Errors opening the archive propagate.
    """
    # (member name prefix, stylesheet key) for parts stored as one numbered
    # XML file per slide/page.
    numbered_parts = (
        ('ppt/slides/slide', 'pp'),
        ('visio/pages/page', 'vs'),
    )

    docdata = b'<html><head>'

    # Context managers make sure the file and the archive get closed.
    with open(fn, 'rb') as f, ZipFile(f) as zf:
        try:
            metadata = zf.read("docProps/core.xml")
            if metadata:
                docdata += rclxslt.apply_sheet_data(meta_stylesheet,
                                                    metadata)
        except Exception:
            pass
        docdata += b'</head><body>'

        try:
            content = zf.read('word/document.xml')
            stl = self.computestylesheet('word')
            docdata += rclxslt.apply_sheet_data(stl, content)
        except Exception:
            pass

        try:
            content = zf.read('xl/sharedStrings.xml')
            stl = self.computestylesheet('xl')
            docdata += rclxslt.apply_sheet_data(stl, content)
        except Exception:
            pass

        for prefix, sheet in numbered_parts:
            # Escaped dot and end anchor: only "<prefix><digits>.xml" is
            # accepted, so a stray member such as "slide1.xml.bak" is
            # ignored instead of breaking the numeric sort for the whole
            # section.
            numbered = re.compile(re.escape(prefix) + r'([0-9]+)\.xml\Z')
            try:
                hits = [m for m in map(numbered.match, zf.namelist()) if m]
                # Only compute the stylesheet when there is something to
                # transform.
                if hits:
                    stl = self.computestylesheet(sheet)
                # Numeric sort on the captured number (slide2 before
                # slide11).
                for m in sorted(hits, key=lambda m: int(m.group(1))):
                    content = zf.read(m.string)
                    docdata += rclxslt.apply_sheet_data(stl, content)
            except Exception:
                # Output of the members processed before the failure is
                # kept; the rest of this section is skipped.
                pass

    docdata += b'</body></html>'
    return docdata
def html_text(self, fn):
    """Transform the file at *fn* with ``self.stylesheet`` and return the result.

    When ``self.dogz`` is true the file is read through ``gzip.open``,
    otherwise it is read as-is.  I/O and transformation errors propagate to
    the caller.
    """
    opener = gzip.open if self.dogz else open
    # Close the handle as soon as the data is in memory instead of relying
    # on garbage collection.
    with opener(fn, 'rb') as f:
        data = f.read()
    return rclxslt.apply_sheet_data(self.stylesheet, data)