def html_text(self, fn):
    """Record the HWP summary metadata, then return the HTML conversion.

    fn is the document path as UTF-8 encoded bytes. The title is stored
    in the 'caption' field and every non-empty value produced by
    metafields() is stored under its own name. Failures while reading the
    metadata are logged and ignored; a failure to open the document is
    logged and re-raised.

    Returns whatever rclexecm.execPythonScript() returns for the
    "hwp5html --html" command.
    """
    # hwp wants str filenames. This is unfortunate
    fn = fn.decode('utf-8')

    try:
        document = fs_Hwp5File(fn)
    except Exception as ex:
        self.em.rclog("hwpfile open failed: %s" % ex)
        raise ex

    def _escaped(text):
        # Field values are handed over as escaped UTF-8 bytes.
        return rclexecm.htmlescape(text.encode('utf-8'))

    try:
        caption = document.summaryinfo.title.strip()
        if caption:
            self.em.setfield('caption', _escaped(caption))

        for name, value in metafields(document.summaryinfo):
            text = "{0}".format(value).strip()
            if not text:
                continue
            escaped_value = _escaped(text)
            field_name = name.encode('utf-8')
            self.em.setfield(field_name, escaped_value)
    except Exception as e:
        # Metadata is best effort: the text conversion below still runs.
        self.em.rclog("Exception: %s" % e)
    finally:
        document.close()

    # The first version of this file used conversion to text using
    # the hwp5 module (no subproc). But this apparently mishandled
    # tables. Switched to executing hwp5html instead. See 1st git
    # version for the old approach.
    return rclexecm.execPythonScript(["hwp5html", "--html", fn])
def takeLine(self, line):
    """Consume one line (bytes) of converter output and append HTML to self.out.

    Leading empty lines are dropped. The first non-empty line triggers
    emission of the HTML header. A line consisting of a single form feed
    becomes a paragraph break with a horizontal rule. Lines matching
    self.patcont are cut at the position found by self.patws and the
    remainder is kept in self.cont to be prepended to the next line.

    Everything stored in self.out and self.cont is bytes, so the
    accumulated output can be joined without mixing str and bytes.
    """
    if not self.gotdata:
        if line == b'':
            return
        # Note the space before 'content': HTML attributes must be
        # separated by whitespace.
        self.out.append(b'<html><head><title></title>'
                        b'<meta http-equiv="Content-Type" '
                        b'content="text/html;charset=UTF-8">'
                        b'</head><body><p>')
        self.gotdata = True

    if self.cont:
        # Prepend what was held back from the previous line.
        line = self.cont + line
        self.cont = b""

    if line == b'\f':
        self.out.append(b'</p><hr><p>')
        return

    if self.patcont.search(line):
        # Break at last whitespace
        match = self.patws.search(line)
        if match:
            # Group 2 is carried over, the line is cut where group 1 starts.
            self.cont = line[match.start(2):match.end(2)]
            line = line[0:match.start(1)]
        else:
            # No break point: hold the whole line back.
            self.cont = line
            line = b''

    if line:
        self.out.append(rclexecm.htmlescape(line) + b'<br>')
    else:
        self.out.append(b'<br>')
def html_text(self, fn):
    """Return the content of file fn wrapped in a minimal HTML <pre> page.

    The file is read as raw bytes and passed through rclexecm.htmlescape()
    before being embedded. The result is a bytes object.
    """
    with open(fn, "rb") as stream:
        escaped = rclexecm.htmlescape(stream.read())
    # No charset, so recoll will have to use its config to guess it
    return b''.join((
        b'<html><head><title></title></head><body><pre>',
        escaped,
        b'</pre></body></html>',
    ))
def _fixhtml(self, input): #print input inheader = False inbody = False didcs = False output = [] isempty = True for line in input.split(b'\n'): if re.search(b'</head>', line): inheader = False if re.search(b'</pre>', line): inbody = False if inheader: if not didcs: output.append(b'<meta http-equiv="Content-Type"' + \ b'content="text/html; charset=UTF-8">\n') didcs = True if self.needescape: m = re.search(b'''(.*<title>)(.*)(<\/title>.*)''', line) if not m: m = re.search(b'''(.*content=")(.*)(".*/>.*)''', line) if m: line = m.group(1) + rclexecm.htmlescape(m.group(2)) + \ m.group(3) # Recoll treats "Subject" as a "title" element # (based on emails). The PDF "Subject" metadata # field is more like an HTML "description" line = re.sub(b'name="Subject"', b'name="Description"', line, 1) elif inbody: s = line[0:1] if s != b"\x0c" and s != b"<": isempty = False # We used to remove end-of-line hyphenation (and join # lines), but but it's not clear that we should do # this as pdftotext without the -layout option does it ? line = rclexecm.htmlescape(line) if re.search(b'<head>', line): inheader = True if re.search(b'<pre>', line): inbody = True output.append(line) return b'\n'.join(output), isempty
def html_text(self, filename):
    """Build an HTML page (bytes) from the EXIF/IPTC/XMP metadata of an image.

    The header gets a <title> built from the values of the keys listed
    in pyexiv2_titles, a date <meta> from the first key of exiv2_dates
    that is present, and a keywords <meta> from 'Xmp.digiKam.TagsList'.
    The body lists every retained "key : value" pair.

    Title fragments are de-duplicated in the iteration order of
    pyexiv2_titles, so the title does not depend on set ordering.
    """
    metadata = pyexiv2.ImageMetadata(filename)
    metadata.read()
    keys = metadata.exif_keys + metadata.iptc_keys + metadata.xmp_keys
    # we skip numeric keys and undecoded makernote data
    mdic = {
        k: str(metadata[k].raw_value)
        for k in keys
        if k != 'Exif.Photo.MakerNote' and not khexre.match(k)
    }

    parts = [b'<html><head>\n']

    # dict.fromkeys() drops duplicates while keeping first-seen order.
    ttdata = dict.fromkeys(
        rclexecm.htmlescape(mdic[k]) for k in pyexiv2_titles if k in mdic)
    if ttdata:
        # The values are str() renderings, possibly of lists: strip the
        # list punctuation.
        title = "".join(
            v.replace('[', '').replace(']', '').replace("'", "") + " "
            for v in ttdata)
        parts.append(rclexecm.makebytes("<title>" + title + "</title>\n"))

    for k in exiv2_dates:
        if k in mdic:
            # Recoll wants: %Y-%m-%d %H:%M:%S.
            # We get 2014:06:27 14:58:47
            dt = mdic[k].replace(":", "-", 2)
            parts.append(b'<meta name="date" content="' +
                         rclexecm.makebytes(dt) + b'">\n')
            break

    tags = mdic.get('Xmp.digiKam.TagsList')
    if tags is not None:
        parts.append(b'<meta name="keywords" content="' +
                     rclexecm.makebytes(rclexecm.htmlescape(tags)) +
                     b'">\n')

    parts.append(b'</head><body>\n')
    for k, v in mdic.items():
        parts.append(rclexecm.makebytes(
            k + " : " + rclexecm.htmlescape(v) + "<br />\n"))
    parts.append(b'</body></html>')

    return b''.join(parts)
def takeLine(self, line):
    """Append one HTML-escaped line of converter output to self.out.

    The HTML header (opening a <pre> block) is emitted once, just before
    the first line, and self.gotdata is set at that point.
    """
    if not self.gotdata:
        header = b''.join((
            b'<html><head>',
            b'<meta http-equiv="Content-Type" ',
            b'content="text/html;charset=UTF-8">',
            b'</head><body><pre>',
        ))
        self.out.append(header)
        self.gotdata = True
    escaped = rclexecm.htmlescape(line)
    self.out.append(escaped)
def wrapData(self):
    """Return the final HTML document (bytes) for the collected output.

    When self.ishtml is set, the lines in self.out are joined and
    returned as they are. Otherwise the lines in self.xmldata are parsed
    as XML with xlsxmltocsv.XlsXmlHandler, the handler output is
    HTML-escaped and appended to self.out, and the page is closed.

    Raises:
        Exception: if no data was received (self.gotdata is false).
    """
    if not self.gotdata:
        raise Exception("xls-dump returned no data")
    if self.ishtml:
        return b'\n'.join(self.out)
    handler = xlsxmltocsv.XlsXmlHandler()
    xml.sax.parseString(b'\n'.join(self.xmldata), handler)
    self.out.append(rclexecm.htmlescape(b'\n'.join(handler.output)))
    return b'\n'.join(self.out) + b'</pre></body></html>'
def _htmlwrapplain(txt, title=b"", charset=b"utf-8"):
    """Wrap plain text in an HTML page with a <pre> body.

    txt is passed through rclexecm.htmlescape(); title and charset are
    inserted as given. All arguments and the result are bytes.
    """
    escaped = rclexecm.htmlescape(txt)
    return b''.join((
        b'<html>\n<head>\n<title>',
        title,
        b'</title>\n',
        b'<meta http-equiv="Content-Type" content="text/html; charset=',
        charset,
        b'">\n',
        b'<body>\n<pre>\n',
        escaped,
        b'</pre>\n</body>\n</html>\n',
    ))
def html_text(self, fn):
    """Convert a DjVu document to an HTML page and return it as a str.

    The author and title are taken from the output of the self.djvused
    command when it is configured; a failure of that command is logged
    and ignored. The text comes from the self.djvutxt command, whose
    failure propagates to the caller. Both outputs are decoded as UTF-8
    with replacement of invalid sequences.
    """
    self.em.setmimetype('text/html')

    # Extract metadata
    rawmeta = b""
    if self.djvused:
        try:
            rawmeta = subprocess.check_output(
                [self.djvused, fn, "-e", "select 1;print-meta"])
        except Exception as e:
            self.em.rclog("djvused failed: %s" % e)

    # Each useful line is a name followed by a double-quoted value.
    fields = {"author": "", "title": ""}
    for entry in rawmeta.decode('UTF-8', 'replace').split('\n'):
        pieces = entry.split('"')
        if len(pieces) < 2:
            continue
        key = pieces[0].strip()
        if key in fields:
            fields[key] = ' '.join(pieces[1:])
    author = fields["author"]
    title = fields["title"]

    # Main text
    text = subprocess.check_output([self.djvutxt, fn]).decode(
        'UTF-8', 'replace')

    chunks = [
        '''<html><head>''',
        '''<title>''' + rclexecm.htmlescape(title) + '''</title>''',
        '''<meta http-equiv="Content-Type" ''',
        '''content="text/html;charset=UTF-8">''',
    ]
    if author:
        chunks.append('''<meta name="author" content="''' +
                      rclexecm.htmlescape(author) + '''">''')
    chunks.append('''</head><body><pre>''')
    chunks.append(rclexecm.htmlescape(text))
    chunks.append('''</pre></body></html>''')
    return ''.join(chunks)
def _selfdoc(self):
    '''Extract the text from the pdf doc (as opposed to attachment)

    The document is converted with the self.pdftotext command and
    post-processed by self._fixhtml(). When the result is reported as
    empty and the "pdfocr" configuration parameter is true, the
    rclocr.py script is run on the file and its escaped output replaces
    the page. Extra metadata and annotations are then merged in when
    enabled. Failures of these optional steps are logged, not raised.

    Returns a tuple (True, html, "", eof) where eof is
    rclexecm.RclExecM.eofnext when attachment extraction is done and no
    attachment is listed, rclexecm.RclExecM.noteof otherwise.
    '''
    self.em.setmimetype('text/html')

    if self.attextractdone and len(self.attachlist) == 0:
        eof = rclexecm.RclExecM.eofnext
    else:
        eof = rclexecm.RclExecM.noteof

    html = subprocess.check_output([self.pdftotext, "-htmlmeta", "-enc", "UTF-8",
                                    "-eol", "unix", "-q", self.filename, "-"])

    html, isempty = self._fixhtml(html)

    if isempty:
        self.config.setKeyDir(os.path.dirname(self.filename))
        s = self.config.getConfParam("pdfocr")
        if rclexecm.configparamtrue(s):
            # Built outside the try block so that the error message
            # below can always refer to it.
            cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"),
                   self.filename]
            try:
                data = subprocess.check_output(cmd)
                html = _htmlprefix + rclexecm.htmlescape(data) + _htmlsuffix
            except Exception as e:
                # OCR is best effort: keep the (empty) pdftotext output.
                self.em.rclog("%s failed: %s" % (cmd, e))

    if self.extrameta:
        try:
            html = self._setextrameta(html)
        except Exception as err:
            self.em.rclog("Metadata extraction failed: %s %s" %
                          (err, traceback.format_exc()))

    if havepopplerglib:
        try:
            html = self._process_annotations(html)
        except Exception as err:
            self.em.rclog("Annotation extraction failed: %s %s" %
                          (err, traceback.format_exc()))

    return (True, html, "", eof)
def _metatag(self, nm, val):
    """Return an HTML <meta name=... content=...> element as bytes.

    nm and val are converted with rclexecm.makebytes(); the value is
    additionally passed through rclexecm.htmlescape(). The name is
    inserted without escaping.
    """
    name = rclexecm.makebytes(nm)
    content = rclexecm.htmlescape(rclexecm.makebytes(val))
    return b''.join((b"<meta name=\"", name, b"\" content=\"", content, b"\">"))