def __init__(self, em):
    """Locate the external helper commands and read the PDF options.

    Stores ``em`` on the instance, looks up ``pdftotext`` (and, depending
    on the configuration, ``tesseract``/``pdftoppm`` for OCR and ``pdftk``
    for attachment extraction), and decides whether the ``pdftotext``
    output needs HTML escaping.

    :param em: object kept as ``self.em`` for later use by the extractor
        (presumably the rclexecm execution manager -- confirm with caller).
    """
    self.currentindex = 0
    self.em = em

    # Read the configuration once rather than re-creating it per parameter.
    config = rclconfig.RclConfig()
    self.confdir = config.getConfDir()
    cf_doocr = config.getConfParam("pdfocr")
    cf_attach = config.getConfParam("pdfattach")
    # NOTE(review): both values are tested for plain truthiness below, so
    # a non-empty string such as "0" would enable the feature -- confirm
    # the value type returned by getConfParam().

    self.pdftotext = rclexecm.which("pdftotext")
    if not self.pdftotext:
        self.pdftotext = rclexecm.which("poppler/pdftotext")

    # Check if we need to escape portions of text where old
    # versions of pdftotext output raw HTML special characters.
    self.needescape = True
    if self.pdftotext:
        try:
            version = subprocess.check_output([self.pdftotext, "-v"],
                                              stderr=subprocess.STDOUT)
            # check_output() returns bytes on Python 3: decode before
            # splitting on a str separator, else the parse always fails
            # and needescape silently stays True.
            version = version.decode("utf-8", "replace")
            major, minor, _rev = version.split()[2].split('.')
            # Don't know exactly when this changed but it's fixed in
            # jessie 0.26.5
            if int(major) > 0 or int(minor) >= 26:
                self.needescape = False
        except (OSError, subprocess.CalledProcessError,
                ValueError, IndexError):
            # Best effort only: if the command cannot be run or its
            # output is not in the expected format, keep escaping.
            pass

    # See if we'll try to perform OCR. Need the commands and either
    # the presence of a file in the config dir (historical) or a set
    # config variable.
    self.ocrpossible = False
    # Always define the command attributes so that later code can test
    # them even when OCR is disabled.
    self.tesseract = None
    self.pdftoppm = None
    if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
        self.tesseract = rclexecm.which("tesseract")
        if self.tesseract:
            self.pdftoppm = rclexecm.which("pdftoppm")
            if self.pdftoppm:
                self.ocrpossible = True
                self.maybemaketmpdir()
    # self.em.rclog("OCRPOSSIBLE: %d" % self.ocrpossible)

    # Pdftk is optionally used to extract attachments. This takes
    # a hit on performance even in the absence of any attachments,
    # so it can be disabled in the configuration.
    self.attextractdone = False
    self.attachlist = []
    self.pdftk = rclexecm.which("pdftk") if cf_attach else None
    if self.pdftk:
        self.maybemaketmpdir()
def __init__(self, logger, infile=sys.stdin):
    """Prepare the reader state and the message builder.

    :param logger: logging object, kept as ``self.log`` and handed to the
        ``EmailBuilder``.
    :param infile: input stream to read from; defaults to ``sys.stdin``.
    """
    self.log = logger
    self.infile = infile
    self.fields = {}

    # The MIME map is looked up in the "examples" subdirectory of the
    # configuration directory and of the data directory, in that order.
    rclconf = rclconfig.RclConfig()
    example_dirs = [
        os.path.join(rclconf.getConfDir(), "examples"),
        os.path.join(rclconf.datadir, "examples"),
    ]
    self.mimemap = conftree.ConfStack('mimemap', example_dirs)

    self.msg = EmailBuilder(self.log, self.mimemap)
def __init__(self, em):
    """Locate the external helper commands and read the PDF options.

    Stores ``em`` on the instance and looks up ``pdftotext``. If it is
    not found, initialisation stops early. Otherwise the configuration is
    read (extra XMP metadata tags, attachment extraction through
    ``pdftk``) and the ``pdftotext`` version is probed to decide whether
    its output needs HTML escaping.

    :param em: object kept as ``self.em`` for later use by the extractor
        (presumably the rclexecm execution manager -- confirm with caller).
    """
    self.currentindex = 0
    self.pdftotext = None
    self.pdfinfo = None
    self.pdftk = None
    self.em = em
    self.tesseract = None
    # Defaults set up front so that the instance is fully defined even
    # when we return early because pdftotext is missing.
    self.needescape = True
    self.attextractdone = False
    self.attachlist = []
    self.extrameta = None

    # Avoid picking up a default version on Windows, we want ours
    if not _mswindows:
        self.pdftotext = rclexecm.which("pdftotext")
    if not self.pdftotext:
        self.pdftotext = rclexecm.which("poppler/pdftotext")
    if not self.pdftotext:
        # No need for anything else. openfile() will return an
        # error at once
        return

    self.config = rclconfig.RclConfig()
    self.confdir = self.config.getConfDir()

    # The user can set a list of meta tags to be extracted from
    # the XMP metadata packet. These are specified as
    # (xmltag,rcltag) pairs
    self.extrameta = self.config.getConfParam("pdfextrameta")
    if self.extrameta:
        self.extrametafix = self.config.getConfParam("pdfextrametafix")
        self._initextrameta()

    # Check if we need to escape portions of text where old
    # versions of pdftotext output raw HTML special characters.
    try:
        version = subprocess.check_output([self.pdftotext, "-v"],
                                          stderr=subprocess.STDOUT)
        # check_output() returns bytes on Python 3: decode before
        # splitting on a str separator, else the parse always fails and
        # needescape silently stays True.
        version = version.decode("utf-8", "replace")
        major, minor, _rev = version.split()[2].split('.')
        # Don't know exactly when this changed but it's fixed in
        # jessie 0.26.5
        if int(major) > 0 or int(minor) >= 26:
            self.needescape = False
    except (OSError, subprocess.CalledProcessError,
            ValueError, IndexError):
        # Best effort only: if the command cannot be run or its output
        # is not in the expected format, keep escaping.
        pass

    # Pdftk is optionally used to extract attachments. This takes
    # a hit on performance even in the absence of any attachments,
    # so it can be disabled in the configuration.
    cf_attach = self.config.getConfParam("pdfattach")
    cf_attach = rclexecm.configparamtrue(cf_attach)
    if cf_attach:
        self.pdftk = rclexecm.which("pdftk")
    if self.pdftk:
        self.maybemaketmpdir()
PY3 = sys.version > '3' if PY3: def makebytes(data): if isinstance(data, bytes): return data else: return data.encode("UTF-8") else: def makebytes(data): if isinstance(data, unicode): return data.encode("UTF-8") else: return data my_config = rclconfig.RclConfig() ############################################ # RclExecM implements the # communication protocol with the recollindex process. It calls the # object specific of the document type to actually get the data. class RclExecM: noteof = 0 eofnext = 1 eofnow = 2 noerror = 0 subdocerror = 1 fileerror = 2 def __init__(self):