예제 #1
0
    def __init__(self, em):
        """Locate the external helper programs and read the PDF options.

        Looks up pdftotext (required for text extraction), and optionally
        tesseract/pdftoppm (OCR) and pdftk (attachment extraction),
        depending on the "pdfocr" and "pdfattach" configuration parameters.

        Args:
            em: stored as ``self.em``. The commented-out debug line below
                suggests it provides ``rclog()`` -- TODO confirm.
        """
        self.currentindex = 0
        self.em = em
        # Defaults so that these attributes always exist, even when OCR is
        # not configured or the commands are not found.
        self.tesseract = None
        self.pdftoppm = None

        # Build the configuration object once and reuse it for all lookups.
        config = rclconfig.RclConfig()
        self.confdir = config.getConfDir()
        cf_doocr = config.getConfParam("pdfocr")
        # NOTE(review): the raw value is tested for truthiness below, so a
        # non-empty string such as "0" would enable attachment extraction.
        # Confirm what getConfParam() returns.
        cf_attach = config.getConfParam("pdfattach")

        self.pdftotext = rclexecm.which("pdftotext")
        if not self.pdftotext:
            self.pdftotext = rclexecm.which("poppler/pdftotext")

        # Check if we need to escape portions of text where old
        # versions of pdftotext output raw HTML special characters.
        self.needescape = True
        if self.pdftotext:
            try:
                version = subprocess.check_output([self.pdftotext, "-v"],
                                                  stderr=subprocess.STDOUT)
                # check_output() returns bytes under Python 3. Decode
                # before parsing, else splitting on a str separator raises
                # TypeError and the check would always fail.
                version = version.decode("utf-8", "replace")
                major, minor, _rev = version.split()[2].split('.')
                # Don't know exactly when this changed but it's fixed in
                # jessie 0.26.5
                if int(major) > 0 or int(minor) >= 26:
                    self.needescape = False
            except (OSError, subprocess.CalledProcessError,
                    ValueError, IndexError):
                # Best effort: if the command can't be run or its output
                # has an unexpected format, keep escaping to be safe.
                pass

        # See if we'll try to perform OCR. We need the commands, and
        # either the presence of a file in the config dir (historical)
        # or a set config variable.
        self.ocrpossible = False
        if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
            self.tesseract = rclexecm.which("tesseract")
            if self.tesseract:
                self.pdftoppm = rclexecm.which("pdftoppm")
                if self.pdftoppm:
                    self.ocrpossible = True
                    self.maybemaketmpdir()
        # self.em.rclog("OCRPOSSIBLE: %d" % self.ocrpossible)

        # Pdftk is optionally used to extract attachments. This takes
        # a hit on performance even in the absence of any attachments,
        # so it can be disabled in the configuration.
        self.attextractdone = False
        self.attachlist = []
        if cf_attach:
            self.pdftk = rclexecm.which("pdftk")
        else:
            self.pdftk = None
        if self.pdftk:
            self.maybemaketmpdir()
예제 #2
0
 def __init__(self, logger, infile=sys.stdin):
     """Store the logger and input file, and prepare the message builder.

     A 'mimemap' ConfStack is built over the "examples" subdirectories
     of the configuration directory and of the configuration's datadir,
     in that order, and handed to the EmailBuilder together with the
     logger.

     Args:
         logger: stored as ``self.log`` and passed to EmailBuilder.
         infile: stored as ``self.infile``; defaults to sys.stdin.
     """
     self.log = logger
     rclconf = rclconfig.RclConfig()
     # Directories given to ConfStack: config dir first, then datadir.
     example_dirs = [
         os.path.join(rclconf.getConfDir(), "examples"),
         os.path.join(rclconf.datadir, "examples"),
     ]
     self.mimemap = conftree.ConfStack('mimemap', example_dirs)
     self.infile = infile
     self.fields = {}
     self.msg = EmailBuilder(self.log, self.mimemap)
예제 #3
0
파일: rclpdf.py 프로젝트: 314eter/recoll
    def __init__(self, em):
        """Locate the external helper programs and read the PDF options.

        If no pdftotext command can be found, initialization stops early
        and the remaining attributes (config, needescape, attachment
        state) are left unset.

        Args:
            em: stored as ``self.em``.
        """
        self.currentindex = 0
        self.pdftotext = None
        self.pdfinfo = None
        self.pdftk = None
        self.em = em
        self.tesseract = None

        # Avoid picking up a default version on Windows, we want ours
        if not _mswindows:
            self.pdftotext = rclexecm.which("pdftotext")
        if not self.pdftotext:
            self.pdftotext = rclexecm.which("poppler/pdftotext")
            if not self.pdftotext:
                # No need for anything else. openfile() will return an
                # error at once
                return

        self.config = rclconfig.RclConfig()
        self.confdir = self.config.getConfDir()
        # The user can set a list of meta tags to be extracted from
        # the XMP metadata packet. These are specified as
        # (xmltag,rcltag) pairs
        self.extrameta = self.config.getConfParam("pdfextrameta")
        if self.extrameta:
            self.extrametafix = self.config.getConfParam("pdfextrametafix")
            self._initextrameta()

        # Check if we need to escape portions of text where old
        # versions of pdftotext output raw HTML special characters.
        self.needescape = True
        try:
            version = subprocess.check_output([self.pdftotext, "-v"],
                                              stderr=subprocess.STDOUT)
            # check_output() returns bytes under Python 3. Decode before
            # parsing, else splitting on a str separator raises TypeError
            # and the check would always fail.
            version = version.decode("utf-8", "replace")
            major, minor, _rev = version.split()[2].split('.')
            # Don't know exactly when this changed but it's fixed in
            # jessie 0.26.5
            if int(major) > 0 or int(minor) >= 26:
                self.needescape = False
        except (OSError, subprocess.CalledProcessError,
                ValueError, IndexError):
            # Best effort: if the command can't be run or its output has
            # an unexpected format, keep escaping to be safe.
            pass

        # Pdftk is optionally used to extract attachments. This takes
        # a hit on performance even in the absence of any attachments,
        # so it can be disabled in the configuration.
        self.attextractdone = False
        self.attachlist = []
        cf_attach = self.config.getConfParam("pdfattach")
        cf_attach = rclexecm.configparamtrue(cf_attach)
        if cf_attach:
            self.pdftk = rclexecm.which("pdftk")
        if self.pdftk:
            self.maybemaketmpdir()
예제 #4
0
# True when running under Python 3 or later. The numeric major version is
# compared: comparing the sys.version string is lexicographic and would
# misorder a two-digit major version ("10..." sorts before "3").
PY3 = sys.version_info[0] >= 3

if PY3:
    def makebytes(data):
        """Return data as bytes, encoding it as UTF-8 unless already bytes."""
        if isinstance(data, bytes):
            return data
        return data.encode("UTF-8")
else:
    def makebytes(data):
        """Return data as a byte string, encoding unicode input as UTF-8."""
        if isinstance(data, unicode):
            return data.encode("UTF-8")
        return data

# Module-level configuration object, created once when this module is
# imported.
my_config = rclconfig.RclConfig()

############################################
# RclExecM implements the communication protocol with the recollindex
# process. It calls the document-type-specific object to actually get
# the data.
class RclExecM:
    noteof = 0
    eofnext = 1
    eofnow = 2

    noerror = 0
    subdocerror = 1
    fileerror = 2
    
    def __init__(self):