Пример #1
0
        def cached_getTargetFormatItemList(content_type):
            """Return the allowed target formats for content_type.

            The conversion server is queried with getAllowedTargetItemList;
            when it answers with an XML-RPC Fault, the older getAllowedTargets
            call is used instead and a DeprecationWarning is emitted.

            Each pair received from the server is returned with its two
            elements swapped.

            Raises ConversionError when the server answers with a code other
            than 200.
            """
            from xmlrpclib import Fault
            proxy = DocumentConversionServerProxy(self)
            try:
                reply = proxy.getAllowedTargetItemList(content_type)
                try:
                    code, payload, message = reply
                except ValueError:
                    # Compatibility with older oood where
                    # getAllowedTargetItemList only returned the response
                    # dict instead of a (code, dict, message) triple.
                    code = 200
                    payload = dict(response_data=reply)
                    message = ''

                if code != 200:
                    # This is very temporary code - XXX needs to be changed
                    # so that the system can retry
                    raise ConversionError(
                        "OOoDocument: can not get list of allowed acceptable"
                        " formats for conversion (Code %s: %s)" %
                        (code, message))
                allowed = payload['response_data']

            except Fault:
                allowed = proxy.getAllowedTargets(content_type)
                warn(
                    'Your oood version is too old, using old method '
                    'getAllowedTargets instead of getAllowedTargetList',
                    DeprecationWarning)

            # tuple order is reversed to be compatible with ERP5 Form
            return [(second, first) for first, second in allowed]
Пример #2
0
    def _convertToHTML(self):
        """Convert the PDF text content to HTML with pdftohtml
    """
        if not self.hasData():
            return ''
        tmp = tempfile.NamedTemporaryFile()
        tmp.write(self.getData())
        tmp.seek(0)

        command_result = None
        try:
            command = [
                'pdftohtml', '-enc', 'UTF-8', '-stdout', '-noframes', '-i',
                tmp.name
            ]
            try:
                command_result = Popen(command, stdout=PIPE).communicate()[0]
            except OSError, e:
                if e.errno == errno.ENOENT:
                    raise ConversionError('pdftohtml was not found')
                raise

        finally:
            tmp.close()
        # Quick hack to remove bg color - XXX
        h = command_result.replace('<BODY bgcolor="#A0A0A0"', '<BODY ')
        # Make links relative
        h = h.replace('href="%s.html' % tmp.name.split(os.sep)[-1],
                      'href="asEntireHTML')
        return h
Пример #3
0
    def updateBaseMetadata(self, **kw):
        """
      Send the metadata values given as keyword arguments to the
      conversion server (run_setmetadata) together with the current base
      data, then store the base data returned by the server.

      Raises NotConvertedError when the document has no base data and
      ConversionError when the server answers with a code other than 200.
    """
        if not self.hasBaseData():
            # XXX please pass a meaningful description of error as argument
            raise NotConvertedError()

        proxy = DocumentConversionServerProxy(self)
        code, payload, message = proxy.run_setmetadata(
            self.getId(), enc(str(self.getBaseData())), kw)
        if code != 200:
            # Explicitly raise the exception!
            raise ConversionError(
                "OOoDocument: error getting document metadata (Code %s: %s)" %
                (code, message))

        # The server accepted the new metadata: keep the updated base data.
        self._setBaseData(dec(payload['data']))
        # record in workflow history # XXX must put appropriate comments.
        self.updateFileMetadata()
Пример #4
0
def getDataURI(url):
    """Open url with urllib2.urlopen.

    Raises ConversionError, carrying the url and the text of the original
    error, when the url cannot be opened.

    NOTE(review): the visible body stops after the error handling; the
    response bound to ``data`` is neither used nor returned here. Confirm
    whether the remainder of the function is missing from this excerpt.
    """
    try:
        data = urllib2.urlopen(url)
    except Exception as e:
        # Formatting the exception with %s yields its message directly;
        # wrapping it in a new Exception first is not needed.
        raise ConversionError(
            "Error to transform url (%s) into data uri. ERROR = %s" %
            (url, e))
Пример #5
0
 def _getFormatFromMimetype(self, mimetype):
     """
 XXX: This should not be done here but Conversion Server API to get
      supported Format/Extension is deprecated (topic under discussion)
 """
     import mimetypes
     extension = mimetypes.guess_extension(mimetype)
     if extension is None:
         raise ConversionError(
             "Could not guess extension from mimetype '%s'" % mimetype)
     return extension.split('.', 1)[1]
Пример #6
0
 def _convertToText(self, format='txt'):  # pylint: disable=redefined-builtin
     """
   Convert the PDF text content to text with pdftotext
 """
     if not self.hasData():
         return ''
     mime_type = 'text/plain'
     portal_transforms = self.getPortalObject().portal_transforms
     filename = self.getFilename()
     result = portal_transforms.convertToData(
         mime_type,
         str(self.getData()),
         context=self,
         filename=filename,
         mimetype=self.getContentType())
     if result:
         return result
     else:
         # Try to use OCR
         # As high dpi images are required, it may take some times to convert the
         # pdf.
         # It may be required to use activities to fill the cache and at the end,
         # to calculate the final result
         text = ''
         content_information = self.getContentInformation()
         page_count = int(content_information.get('Pages', 0))
         for page_number in range(page_count):
             src_mimetype, png_data = self._convert('png',
                                                    quality=100,
                                                    resolution=300,
                                                    frame=page_number,
                                                    display='identical')
             if not src_mimetype.endswith('png'):
                 continue
             content = str(png_data)
             if content is not None:
                 filename = self.getStandardFilename(format='png')
                 result = portal_transforms.convertToData(
                     mime_type,
                     content,
                     context=self,
                     filename=filename,
                     mimetype=src_mimetype)
                 if result is None:
                     raise ConversionError(
                         'PDFDocument conversion error. '
                         'portal_transforms failed to convert to %s: %r' %
                         (mime_type, self))
                 text += result
         return text
Пример #7
0
    def convertTo(self, format):
        """Convert self.data to the given format with the conversion server.

        The server is first asked which target formats it allows for
        self.mimetype; the conversion itself is requested with run_generate.
        When self.mimetype is 'text/html' the converted data additionally
        goes through self.includeImageList.

        Returns the decoded converted data.

        Raises ConversionError when the format is not among the allowed
        ones, or when the server answers either request with a code other
        than 200.
        """
        # XXX Must be replaced by portal_data_adapters soon
        from erp5.component.document.Document import (
            DocumentConversionServerProxy, enc, dec)

        server_proxy = DocumentConversionServerProxy(self.context)
        response_code, response_dict, message = \
            server_proxy.getAllowedTargetItemList(self.mimetype)
        if response_code != 200:
            # Report the server error instead of failing later with a
            # KeyError on the missing 'response_data' entry.
            raise ConversionError(
                'Can not get list of allowed target formats (Code %s: %s)' %
                (response_code, message))

        allowed_extension_list = response_dict['response_data']
        if format not in dict(allowed_extension_list):
            raise ConversionError('Format not allowed %s' % format)

        response_code, response_dict, message = server_proxy.run_generate(
            '', enc(self.data), None, format, self.mimetype)
        if response_code != 200:
            # Same reasoning as above for the missing 'data' entry.
            raise ConversionError(
                'Error converting to format %s (Code %s: %s)' %
                (format, response_code, message))

        data = dec(response_dict['data'])
        if self.mimetype == 'text/html':
            data = self.includeImageList(data)
        return data
Пример #8
0
    def _convertToDJVU(self):
        """Convert the PDF text content to DJVU with pdf2djvu
    """
        if not self.hasData():
            return ''
        tmp = tempfile.NamedTemporaryFile()
        tmp.write(self.getData())
        tmp.seek(0)

        command_result = None
        try:
            command = ['pdf2djvu', tmp.name]
            try:
                command_result = Popen(command, stdout=PIPE).communicate()[0]
            except OSError, e:
                if e.errno == errno.ENOENT:
                    raise ConversionError('pdf2djvu was not found')
                raise

        finally:
            tmp.close()
        return command_result
Пример #9
0
 def _convertToBaseFormat(self):
     """
   Ask the conversion server (run_convert) to convert the original
   document, then store the returned data as base data and the returned
   metadata in _base_metadata. When the metadata carries a 'MIMEType'
   entry, it becomes the base content type.

   Raises ConversionError when the server answers with a code other
   than 200.
 """
     proxy = DocumentConversionServerProxy(self)
     code, payload, message = proxy.run_convert(
         self.getFilename() or self.getId(),
         enc(str(self.getData())),
         None,
         None,
         self.getContentType())
     if code != 200:
         # Explicitly raise the exception!
         raise ConversionError(
             "OOoDocument: Error converting document to base format. (Code %s: %s)"
             % (code, message))

     # successfully converted document
     self._setBaseData(dec(payload['data']))
     base_metadata = payload['meta']
     self._base_metadata = base_metadata
     base_mime_type = base_metadata.get('MIMEType', None)
     if base_mime_type is not None:
         self._setBaseContentType(base_mime_type)
Пример #10
0
    def convert(self, orig, data, context=None, **kwargs):
        """Convert orig with the conversion server and store it in data.

        The source format is derived from the mimetype returned by
        _getAllowedSourceMimetypeFromConversionServer and the destination
        format from self.output. The optional 'zip', 'refresh' and
        'conversion_kw' keyword arguments are forwarded to convertFile.

        Returns data after its setData method received the decoded result.

        Raises ConversionError when no source mimetype is allowed.
        """
        server_proxy = DocumentConversionServerProxy(context)

        allowed_mimetype = self._getAllowedSourceMimetypeFromConversionServer(
            server_proxy)
        if allowed_mimetype is None:
            raise ConversionError(
                "Format(s) not allowed on Conversion Server %r" % self.inputs)
        input_format = self._getFormatFromMimetype(allowed_mimetype)
        output_format = self._getFormatFromMimetype(self.output)

        # Default values are ConversionServer default ones
        encoded_result = server_proxy.convertFile(
            enc(orig),
            input_format,
            output_format,
            kwargs.get('zip', False),
            kwargs.get('refresh', False),
            kwargs.get('conversion_kw', {}))
        data.setData(dec(encoded_result))

        return data
Пример #11
0
    def _resize(
        self,
        quality,
        width,
        height,
        format,  # pylint: disable=redefined-builtin
        resolution,
        frame,
        crop=False,
    ):
        """Resize and resample photo.

        Builds a 'convert' command line from the arguments, feeds it the
        document data on standard input and returns a StringIO holding what
        the command wrote on standard output.

        Raises ConversionError when format contains a '/', when '/' is
        writable, or when the command produced no output.
        """
        # https://github.com/saucecontrol/Compact-ICC-Profiles
        profile_path = os.path.join(os.path.dirname(Products.ERP5.__file__),
                                    'misc', 'sRGB-v2-magic.icc')
        arguments = [
            'convert', '-colorspace', 'sRGB', '-depth', '8', '-profile',
            profile_path
        ]
        size = '%sx%s' % (width, height)
        if crop:
            arguments.extend(
                ['-thumbnail', '%sx%s^' % (width, height),
                 '-gravity', 'center',
                 '-extent', size])
        else:
            arguments.extend(['-geometry', size])
        arguments.extend(['-quality', str(quality)])
        if format not in VALID_TRANSPARENT_IMAGE_FORMAT_LIST:
            # ImageMagick way to remove transparent that works with multiple
            # images. http://www.imagemagick.org/Usage/masking/#remove
            arguments.extend(['-bordercolor', 'white', '-border', '0'])
        if resolution:
            arguments.extend(['-density', '%sx%s' % (resolution, resolution)])

        # Input is read from stdin, optionally restricted to one frame.
        arguments.append('-' if frame is None else '-[%s]' % frame)

        # Output is written to stdout, optionally in an explicit format.
        if not format:
            arguments.append('-')
        else:
            # Is there a way to make 'convert' fail if the format is unknown,
            # instead of treating this whole parameter as an output file path?
            # As a workaround, we run 'convert' in a non-writeable directory.
            if '/' in format or os.access('/', os.W_OK):
                raise ConversionError
            arguments.append('%s:-' % format)

        source_data = str(self.getData())
        if self.getContentType() == "image/svg+xml":
            source_data = transformUrlToDataURI(source_data)

        environment = dict(os.environ, LC_NUMERIC='C')
        process = subprocess.Popen(arguments,
                                   env=environment,
                                   stdin=subprocess.PIPE,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   cwd='/',
                                   close_fds=True)
        try:
            # XXX: The only portable way is to pass what stdin.write can accept,
            #      which is a string for PIPE.
            image, err = process.communicate(source_data)
        finally:
            del process
        if not image:
            raise ConversionError('Image conversion failed (%s).' % err)
        return StringIO(image)
Пример #12
0
    def getContentInformation(self):
        """Returns the information about the PDF document with pdfinfo.

    Returns an empty dict when the document has no data, and a copy of
    self._content_information when that attribute is already set.
    Otherwise the "key: value" lines printed by pdfinfo are collected in a
    dict, which is completed with the document info read by PyPDF2 when
    that package can be imported.

    Raises ConversionError when the pdfinfo executable is not found.

    NOTE(review): the visible body ends inside the outer try block, before
    any except/finally clause and before any final return. Confirm the
    remainder of the method against the complete file.
    """
        if not self.hasData():
            return {}
        try:
            # A previously stored result is returned as a copy.
            return self._content_information.copy()  # pylint: disable=access-member-before-definition
        except AttributeError:
            pass
        # pdfinfo reads the document from a file path, so the data is
        # written to a named temporary file first.
        tmp = tempfile.NamedTemporaryFile()
        tmp.write(self.getData())
        tmp.seek(0)
        command_result = None
        try:

            # First, we use pdfinfo to get standard metadata
            command = ['pdfinfo', '-meta', '-box', tmp.name]
            try:
                command_result = Popen(command, stdout=PIPE).communicate()[0]
            except OSError, e:
                if e.errno == errno.ENOENT:
                    raise ConversionError('pdfinfo was not found')
                raise

            result = {}
            for line in command_result.splitlines():
                # Only the first ':' separates the key from the value; the
                # remaining parts are joined back into the value.
                item_list = line.split(':')
                key = item_list[0].strip()
                value = ':'.join(item_list[1:]).strip()
                result[key] = value

            # Then we use PyPDF2 to get extra metadata
            try:
                from PyPDF2 import PdfFileReader
                from PyPDF2.utils import PdfReadError
            except ImportError:
                # if PyPDF2 not found, pass
                pass
            else:
                try:
                    pdf_file = PdfFileReader(tmp)
                    for info_key, info_value in (pdf_file.getDocumentInfo()
                                                 or {}).iteritems():
                        info_key = info_key.lstrip("/")
                        if isinstance(info_value, unicode):
                            info_value = info_value.encode("utf-8")

                        # Ignore values that cannot be pickled ( such as AAPL:Keywords )
                        try:
                            pickle.dumps(info_value)
                        except pickle.PicklingError:
                            LOG(
                                "PDFDocument.getContentInformation", INFO,
                                "Ignoring non picklable document info on %s: %s (%r)"
                                %
                                (self.getRelativeUrl(), info_key, info_value))
                        else:
                            # setdefault keeps the value already found by
                            # pdfinfo when both tools report the same key.
                            result.setdefault(info_key, info_value)
                except (PdfReadError, AssertionError):
                    LOG("PDFDocument.getContentInformation", PROBLEM,
                      "PyPDF2 is Unable to read PDF, probably corrupted PDF here : %s" % \
                      (self.getRelativeUrl(),))
                except Exception:
                    # an exception of Exception class will be raised when the
                    # document is encrypted.
                    pass
Пример #13
0
    def _convert(self, format, frame=0, **kw):  #  pylint: disable=redefined-builtin
        """Convert the document to the given format.

    If a conversion is already stored for this format, it is returned
    directly, otherwise the conversion is stored for the next time.

    format: requested target format. An empty value returns the raw data,
    'base-data' returns the base data.
    frame: Only used for image conversion
    kw: extra conversion parameters, passed to hasConversion,
    setConversion, getConversion and to the temporary Image conversion.

    Returns a (mime type, data) pair; ('text/plain', '') when the document
    has no data.

    Raises NotConvertedError when a format is requested but the document
    has no base data, and ConversionError when the selected target format
    is not allowed.

    XXX Cascading conversions must be delegated to conversion server,
    not by OOoDocument._convert (ie: convert to pdf, then convert to image, then resize)
    *OR* as an optimisation we can read cached intermediate conversions
    instead of compute them each times.
      1- odt->pdf->png
      2- odt->cached(pdf)->jpg
    """
        #XXX if document is empty, stop to try to convert.
        #XXX but I don't know what is a appropriate mime-type.(Yusei)
        if not self.hasData():
            return 'text/plain', ''
        # if no conversion asked (format empty)
        # return raw data
        if not format:
            return self.getContentType(), self.getData()
        # Check if we have already a base conversion
        if not self.hasBaseData():
            # XXX please pass a meaningful description of error as argument
            raise NotConvertedError()
        # Make sure we can support html and pdf by default
        is_html = 0
        requires_pdf_first = 0
        # The requested format is kept as the conversion cache key; 'format'
        # itself is replaced below by the matching server-side format name.
        original_format = format
        allowed_format_list = self.getTargetFormatList()
        if format == 'base-data':
            return self.getBaseContentType(), str(self.getBaseData())
        if format == 'pdf':
            # NOTE(review): format_list[0] raises IndexError when no allowed
            # format ends with 'pdf' - confirm this cannot happen.
            format_list = [x for x in allowed_format_list if x.endswith('pdf')]
            format = format_list[0]
        elif format in VALID_IMAGE_FORMAT_LIST:
            format_list = [
                x for x in allowed_format_list if x.endswith(format)
            ]
            if len(format_list):
                format = format_list[0]
            else:
                # We must first make a PDF which will be used to produce an image out of it
                requires_pdf_first = 1
                format_list = [
                    x for x in allowed_format_list if x.endswith('pdf')
                ]
                format = format_list[0]
        elif format == 'html':
            format_list = [
                x for x in allowed_format_list
                if x.startswith('html') or x.endswith('html')
            ]
            format = format_list[0]
            is_html = 1
        elif format in ('txt', 'text', 'text-content'):
            # if possible, we try to get utf8 text. ('enc.txt' will encode to utf8)
            if 'enc.txt' in allowed_format_list:
                format = 'enc.txt'
            elif format not in allowed_format_list:
                #Text conversion is not supported by oood, do it in other way
                if not self.hasConversion(format=original_format):
                    #Do real conversion for text
                    mime, data = self._getConversionFromProxyServer(
                        format='text-content')
                    self.setConversion(data, mime, format=original_format)
                    return mime, data
                return self.getConversion(format=original_format)
        # Raise an error if the format is not supported
        if not self.isTargetFormatAllowed(format):
            raise ConversionError(
                "OOoDocument: target format %s is not supported" % format)
        has_format = self.hasConversion(format=original_format, **kw)
        if not has_format:
            # Do real conversion
            mime, data = self._getConversionFromProxyServer(format)
            if is_html:
                # Extra processing required since
                # we receive a zip file
                cs = cStringIO.StringIO()
                cs.write(str(data))
                z = zipfile.ZipFile(
                    cs)  # A disk file would be more RAM efficient
                # Keep the first entry whose name ends with 'html'; for a
                # Presentation, entries without 'impr' in their name are
                # skipped.
                for f in z.infolist():
                    fn = f.filename
                    if fn.endswith('html'):
                        if self.getPortalType() == 'Presentation'\
                              and not (fn.find('impr') >= 0):
                            continue
                        data = z.read(fn)
                        break
                mime = 'text/html'
                self._populateConversionCacheWithHTML(
                    zip_file=z)  # Maybe some parts should be asynchronous for
                # better usability
                z.close()
                cs.close()
            if original_format not in VALID_IMAGE_FORMAT_LIST \
              and not requires_pdf_first:
                self.setConversion(data, mime, format=original_format, **kw)
            else:
                # create temporary image and use it to resize accordingly
                temp_image = self.portal_contributions.newContent(
                    portal_type='Image',
                    file=cStringIO.StringIO(),
                    filename=self.getId(),
                    temp_object=1)
                temp_image._setData(data)
                # we care for first page only but as well for image quality
                mime, data = temp_image.convert(original_format,
                                                frame=frame,
                                                **kw)
                # store conversion
                self.setConversion(data, mime, format=original_format, **kw)

        # Whether it was just stored or already cached, the result is always
        # read back from the conversion cache.
        return self.getConversion(format=original_format, **kw)
Пример #14
0
    def _convertToText(self, format='txt'):  # pylint: disable=redefined-builtin
        """Convert the PDF to text

    If the PDF have text, return the text, otherwise try to do OCR using
    tesseract.
    """
        if not self.hasData():
            return ''
        data = str(self.getData())
        try:
            from PyPDF2 import PdfFileReader
            from PyPDF2.utils import PdfReadError
        except ImportError:
            pass
        else:
            try:
                if PdfFileReader(StringIO(data)).isEncrypted:
                    return ''
            except PdfReadError:
                return ''

        mime_type = 'text/plain'
        portal_transforms = self.getPortalObject().portal_transforms
        filename = self.getFilename()
        result = portal_transforms.convertToData(
            mime_type,
            data,
            context=self,
            filename=filename,
            mimetype=self.getContentType())
        if result:
            return result
        else:
            # Try to use OCR from ghostscript, but tolerate that the command might
            # not be available.
            process = None
            command = [
                'gs', '-q', '-dQUIET', '-dSAFER', '-dBATCH', '-dNOPAUSE',
                '-dNOPROMPT', '-sDEVICE=ocr', '-r300x300', '-o', '-', '-f', '-'
            ]
            try:
                process = Popen(
                    command,
                    stdin=PIPE,
                    stdout=PIPE,
                    stderr=PIPE,
                    close_fds=True,
                )
                output, error = process.communicate(data)
                if process.returncode:
                    raise ConversionError(
                        "Error invoking ghostscript.\noutput:%s\nerror:%s" %
                        (output, error))
                return output.strip()
            except OSError as e:
                if e.errno != errno.ENOENT:
                    raise
            finally:
                del process

            # We don't have ghostscript, fallback to the expensive pipeline using:
            #   pdf -- (Image._convert imagemagick) --> png
            #       -- (PortalTransforms.png_to_tiff imagemagick) --> tiff
            #       -- (PortalTransforms.tiff_to_text tesseract) --> text
            #
            # As high dpi images are required, it may take some times to convert the
            # pdf.
            # It may be required to use activities to fill the cache and at the end,
            # to calculate the final result
            text = ''
            content_information = self.getContentInformation()
            page_count = int(content_information.get('Pages', 0))
            for page_number in range(page_count):
                src_mimetype, png_data = self._convert('png',
                                                       quality=100,
                                                       resolution=300,
                                                       frame=page_number,
                                                       display='identical')
                if not src_mimetype.endswith('png'):
                    continue
                content = str(png_data)
                if content is not None:
                    filename = self.getStandardFilename(format='png')
                    result = portal_transforms.convertToData(
                        mime_type,
                        content,
                        context=self,
                        filename=filename,
                        mimetype=src_mimetype)
                    if result is None:
                        raise ConversionError(
                            'PDFDocument conversion error. '
                            'portal_transforms failed to convert to %s: %r' %
                            (mime_type, self))
                    text += result
            return text
Пример #15
0
    def _convert(
            self,
            format,
            substitution_method_parameter_dict=None,  # pylint: disable=redefined-builtin
            safe_substitute=True,
            charset=None,
            text_content=None,
            substitute=True,
            **kw):
        """
      Convert text using portal_transforms or oood
    """
        # XXX 'or DEFAULT_CONTENT_TYPE' is compaptibility code used for old
        # web_page that have neither content_type nor text_format. Migration
        # should be done to make all web page having content_type property
        src_mimetype = self.getContentType() or DEFAULT_CONTENT_TYPE
        if not format and src_mimetype == 'text/html':
            format = 'html'  # Force safe_html
        if not format:
            # can return document without conversion
            return src_mimetype, self.getTextContent()
        portal = self.getPortalObject()
        mime_type = portal.mimetypes_registry.lookupExtension('name.%s' %
                                                              format)
        original_mime_type = mime_type = str(mime_type)
        if text_content is None:
            # check if document has set text_content and convert if necessary
            text_content = self.getTextContent()
        if text_content:
            kw['format'] = format
            convert_kw = {}
            # PortalTransforms does not accept empty values for 'encoding' parameter
            if charset:
                kw['charset'] = convert_kw['encoding'] = charset
            if not self.hasConversion(**kw):
                portal_transforms = portal.portal_transforms
                filename = self.getFilename()
                if mime_type == 'text/html':
                    mime_type = 'text/x-html-safe'
                if src_mimetype != "image/svg+xml":
                    result = portal_transforms.convertToData(
                        mime_type,
                        text_content,
                        object=self,
                        context=self,
                        filename=filename,
                        mimetype=src_mimetype,
                        **convert_kw)
                    if result is None:
                        raise ConversionError(
                            'TextDocument conversion error. '
                            'portal_transforms failed to convert '
                            'from %r to %s: %r' %
                            (src_mimetype, mime_type, self))
                else:
                    result = text_content
                if format in VALID_IMAGE_FORMAT_LIST:
                    # Include extra parameter for image conversions
                    temp_image = self.portal_contributions.newContent(
                        portal_type='Image',
                        file=BytesIO(),
                        filename=self.getId(),
                        temp_object=1)
                    temp_image._setData(result)
                    _, result = temp_image.convert(**kw)

                self.setConversion(result, original_mime_type, **kw)
            else:
                mime_type, result = self.getConversion(**kw)
            if substitute and format in VALID_TEXT_FORMAT_LIST:
                # only textual content can be sustituted
                if substitution_method_parameter_dict is None:
                    substitution_method_parameter_dict = {}
                result = self._substituteTextContent(
                    result,
                    safe_substitute=safe_substitute,
                    **substitution_method_parameter_dict)
            return original_mime_type, result
        else:
            # text_content is not set, return empty string instead of None
            return original_mime_type, ''