Python detect_document_type 예제들, document_type.detect_document_type Python 예제들

예제 #1

0

파일 보기

 def convert_by_stream(self, data, format=LIBREOFFICE_OPEN_DOCUMENT):
     """Convert an in-memory document through the office desktop.

     ``data`` is a seekable file-like object; it is rewound and read in
     full, loaded via a ``private:stream`` URL, and stored back through an
     ``output_stream_wrapper`` using the export filter named by ``format``.

     Returns the ``data`` attribute of the output stream wrapper.

     Raises ``Exception`` when the desktop returns no document, and
     ``docvert_exception.converter_unable_to_generate_open_document`` when
     ``format`` is ``LIBREOFFICE_OPEN_DOCUMENT`` but the stored result is
     not detected as an OASIS OpenDocument.
     """
     data.seek(0)
     input_stream = self._service_manager.createInstanceWithContext(
         "com.sun.star.io.SequenceInputStream", self._local_context)
     input_stream.initialize((uno.ByteSequence(data.read()), ))
     document = self._desktop.loadComponentFromURL(
         'private:stream', "_blank", 0,
         self._to_properties(InputStream=input_stream, ReadOnly=True))
     if not document:
         # Call form of raise: valid on both Python 2 and Python 3.
         raise Exception("Error making document")
     try:
         document.refresh()
     except AttributeError:
         # A loaded component without refresh() is used as loaded.
         pass
     output_stream = output_stream_wrapper()
     try:
         document.storeToURL(
             'private:stream',
             self._to_properties(OutputStream=output_stream,
                                 FilterName=format))
     finally:
         # Close the document whether or not the store succeeded.
         document.close(True)
     if format == LIBREOFFICE_OPEN_DOCUMENT:
         # Verify the output before handing it back to the caller.
         doc_type = document_type.detect_document_type(output_stream.data)
         if doc_type != document_type.types.oasis_open_document:
             raise docvert_exception.converter_unable_to_generate_open_document(
             )
     return output_stream.data

예제 #2

0

파일 보기

파일: docvert.py 프로젝트: gordonbanderson/docvert

def process_conversion(files=None, urls=None, pipeline_id=None, pipeline_type="pipelines", auto_pipeline_id=None, storage_type_name=docvert_storage.storage_type.memory_based, converter=converter_type.python_streaming_to_libreoffice):
    """Run every supplied file through the requested pipeline.

    ``files`` maps a filename to its document data. Data that is not
    detected as an OASIS OpenDocument is first converted with
    ``generate_open_document`` using ``converter``.

    Returns the storage object obtained from ``docvert_storage.get_storage``.

    Raises ``docvert_exception.needs_files_or_urls`` when both ``files`` and
    ``urls`` are None, and ``docvert_exception.unrecognised_pipeline`` when
    ``pipeline_id`` is None.
    """
    if files is None and urls is None:
        raise docvert_exception.needs_files_or_urls()
    if pipeline_id is None:
        raise docvert_exception.unrecognised_pipeline("Unknown pipeline '%s'" % pipeline_id)
    storage = docvert_storage.get_storage(storage_type_name)
    # The guard above accepts a urls-only call, so files can still be None
    # here; treat that as "no files" instead of failing on None.iteritems().
    # NOTE(review): urls are not processed anywhere in this function.
    if files is None:
        files = {}
    for filename, data in files.iteritems():
        doc_type = document_type.detect_document_type(data)
        if doc_type != document_type.types.oasis_open_document:
            data = generate_open_document(data, converter)
        document_xml = opendocument.extract_useful_open_document_files(data, storage, filename)
        process_pipeline(document_xml, pipeline_id, pipeline_type, auto_pipeline_id, storage, filename)
    return storage

예제 #3

0

파일 보기

def process_conversion(files=None, urls=None, pipeline_id=None, pipeline_type="pipelines", auto_pipeline_id=None, storage_type_name=docvert_storage.storage_type.memory_based, converter=converter_type.python_streaming_to_libreoffice, suppress_errors=False):
    """Prepare uploaded files and downloaded URLs for conversion.

    Each key already in ``files`` is registered with the storage as its own
    friendly name. Each entry of ``urls`` is downloaded; content detected as
    HTML is converted with ``html_to_opendocument``; the result is added to
    ``files`` as a ``StringIO`` under a filename chosen by ``_title``. A
    download that raises ``IOError`` is recorded in ``files`` as an
    ``Exception`` instance instead of aborting the run.

    Note that a ``files`` mapping passed by the caller is modified in place.

    Raises ``docvert_exception.needs_files_or_urls`` when both ``files`` and
    ``urls`` are None, and ``docvert_exception.unrecognised_pipeline`` when
    ``pipeline_id`` is None.
    """
    if files is None and urls is None:
        raise docvert_exception.needs_files_or_urls()
    if pipeline_id is None:
        raise docvert_exception.unrecognised_pipeline("Unknown pipeline '%s'" % pipeline_id)
    storage = docvert_storage.get_storage(storage_type_name)
    # Only one of files/urls is required by the guard above, so either may
    # still be None; normalise both so the loops below cannot fail on None.
    if files is None:
        files = {}
    if urls is None:
        urls = []

    def _title(name, files, data):
        """Return a filename for ``name`` that does not clash with ``files``.

        An existing entry is reused when it is file-like and already holds
        exactly ``data``; otherwise a counter is inserted before the first
        dot (or appended when there is no dot) until the name is unused.
        """
        filename = os.path.basename(name).replace('\\','-').replace('/','-').replace(':','-')
        if not filename:
            filename = "document.odt"
        if filename in files:
            if data and hasattr(files[filename], 'read') and files[filename].getvalue() == data:
                return filename
            unique = 1
            potential_filename = filename
            while potential_filename in files:
                unique += 1
                if "." in filename:
                    potential_filename = filename.replace(".", "%i." % unique, 1)
                else:
                    potential_filename = filename + str(unique)
            filename = potential_filename
        return filename

    for filename in files:
        storage.set_friendly_name(filename, filename)

    for url in urls:
        try:
            data = urllib2.urlopen(url, None, http_timeout).read()
            doc_type = document_type.detect_document_type(data)
            if doc_type == document_type.types.html:
                data = html_to_opendocument(data, url)
            filename = _title(url, files, data)
            storage.set_friendly_name(filename, "%s (%s)" % (filename, url))
            files[filename] = StringIO.StringIO(data)
        except IOError as e:
            # Keep going: store the failure under the URL's filename.
            filename = _title(url, files, None)
            storage.set_friendly_name(filename, "%s (%s)" % (filename, url))
            files[filename] = Exception("Download error from %s: %s" % (url, e))

예제 #4

0

파일 보기

파일: docvert.py 프로젝트: gordonbanderson/docvert

def process_conversion(
        files=None,
        urls=None,
        pipeline_id=None,
        pipeline_type="pipelines",
        auto_pipeline_id=None,
        storage_type_name=docvert_storage.storage_type.memory_based,
        converter=converter_type.python_streaming_to_libreoffice):
    """Convert each supplied file to OpenDocument and run it through a pipeline.

    ``files`` maps a filename to its document data. Anything not detected
    as an OASIS OpenDocument goes through ``generate_open_document`` with
    ``converter`` before its useful parts are extracted and passed to
    ``process_pipeline``.

    Returns the storage object obtained from ``docvert_storage.get_storage``.

    Raises ``docvert_exception.needs_files_or_urls`` when both ``files`` and
    ``urls`` are None, and ``docvert_exception.unrecognised_pipeline`` when
    ``pipeline_id`` is None.
    """
    if files is None and urls is None:
        raise docvert_exception.needs_files_or_urls()
    if pipeline_id is None:
        raise docvert_exception.unrecognised_pipeline("Unknown pipeline '%s'" %
                                                      pipeline_id)
    storage = docvert_storage.get_storage(storage_type_name)
    # A urls-only call passes the guard above with files still None; use an
    # empty mapping so the loop is skipped rather than raising on None.
    # NOTE(review): urls are not processed anywhere in this function.
    if files is None:
        files = {}
    for filename, data in files.iteritems():
        doc_type = document_type.detect_document_type(data)
        if doc_type != document_type.types.oasis_open_document:
            data = generate_open_document(data, converter)
        document_xml = opendocument.extract_useful_open_document_files(
            data, storage, filename)
        process_pipeline(document_xml, pipeline_id, pipeline_type,
                         auto_pipeline_id, storage, filename)
    return storage

예제 #5

0

파일 보기

파일: docvert_libreoffice.py 프로젝트: Br3nda/docvert

 def convert_by_stream(self, data, format=LIBREOFFICE_OPEN_DOCUMENT):
     """Load ``data`` into the office desktop and export it as ``format``.

     ``data`` is a seekable file-like object that is rewound and read
     completely. The document is loaded from, and stored to, a
     ``private:stream`` URL; the export is collected by an
     ``output_stream_wrapper`` whose ``data`` attribute is returned.

     Raises ``Exception`` when the desktop returns no document, and
     ``docvert_exception.converter_unable_to_generate_open_document`` when
     ``format`` is ``LIBREOFFICE_OPEN_DOCUMENT`` but the stored result is
     not detected as an OASIS OpenDocument.
     """
     data.seek(0)
     input_stream = self._service_manager.createInstanceWithContext("com.sun.star.io.SequenceInputStream", self._local_context)
     input_stream.initialize((uno.ByteSequence(data.read()),))
     document = self._desktop.loadComponentFromURL('private:stream', "_blank", 0, self._to_properties(InputStream=input_stream,ReadOnly=True))
     if not document:
         # Call form of raise: valid on both Python 2 and Python 3.
         raise Exception("Error making document")
     try:
         document.refresh()
     except AttributeError:
         # A loaded component without refresh() is used as loaded.
         pass
     output_stream = output_stream_wrapper()
     try:
         document.storeToURL('private:stream', self._to_properties(
             OutputStream=output_stream,
             FilterName=format))
     finally:
         # Close the document whether or not the store succeeded.
         document.close(True)
     if format == LIBREOFFICE_OPEN_DOCUMENT:
         # Verify the output before handing it back to the caller.
         doc_type = document_type.detect_document_type(output_stream.data)
         if doc_type != document_type.types.oasis_open_document:
             raise docvert_exception.converter_unable_to_generate_open_document()
     return output_stream.data

예제 #6

0

파일 보기

     try:
         data = urllib2.urlopen(url, None, http_timeout).read()
         doc_type = document_type.detect_document_type(data)
         if doc_type == document_type.types.html:
             data = html_to_opendocument(data, url)
         filename = _title(url, files, data)
         storage.set_friendly_name(filename, "%s (%s)" % (filename, url))
         files[filename] = StringIO.StringIO(data)
     except IOError, e:
         filename = _title(url, files, None)
         storage.set_friendly_name(filename, "%s (%s)" % (filename, url))
         files[filename] = Exception("Download error from %s: %s" % (url, e))
 for filename, data in files.iteritems():
     if storage.default_document is None:
         storage.default_document = filename
     doc_type = document_type.detect_document_type(data)
     if doc_type == document_type.types.exception:
         storage.add("%s/index.txt" % filename, str(data))
     elif doc_type != document_type.types.oasis_open_document:
         try:
             data = generate_open_document(data, converter)
             doc_type = document_type.types.oasis_open_document
         except Exception, e:
             if not suppress_errors:
                 raise e
             storage.add("%s/index.txt" % filename, str(e))
     if doc_type == document_type.types.oasis_open_document:
         if pipeline_id == "open document": #reserved term, for when people want the Open Document file back directly. Don't bother loading pipeline.
             storage.add("%s/index.odt" % filename, data)
             thumbnail = opendocument.extract_thumbnail(data)
             if thumbnail:

예제 #7

0

파일 보기

파일: docvert_libreoffice.py 프로젝트: gordonbanderson/docvert

        document = self._desktop.loadComponentFromURL('private:stream', "_blank", 0, self._to_properties(InputStream=input_stream,ReadOnly=True))
        if not document:
            raise Exception, "Error making document"
        try:
            document.refresh()
        except AttributeError:
            pass
        output_stream = output_stream_wrapper()
        try:
            document.storeToURL('private:stream', self._to_properties(OutputStream=output_stream, FilterName=format))
        except Exception, e: #ignore any error, verify the output before complaining
            pass
        finally:
            document.close(True)
        if format == LIBREOFFICE_OPEN_DOCUMENT or format == LIBREOFFICE_PDF:
            doc_type = document_type.detect_document_type(output_stream.data)
            output_stream.data.seek(0)
            if format == LIBREOFFICE_OPEN_DOCUMENT and doc_type != document_type.types.oasis_open_document:
                raise docvert_exception.converter_unable_to_generate_open_document("Unable to generate OpenDocument, was detected as %s. First 2 bytes = %s" % (doc_type, output_stream.data.read(2)))
            elif format == LIBREOFFICE_PDF and doc_type != document_type.types.pdf:
                raise docvert_exception.converter_unable_to_generate_pdf("Unable to generate PDF, was detected as %s. First 4 bytes = %s" % (doc_type, output_stream.data.read(4)))
        return output_stream.data

    def _to_properties(self, **args):
        props = []
        for key in args:
            prop = PropertyValue()
            prop.Name = key
            prop.Value = args[key]
            props.append(prop)
        return tuple(props)

예제 #8

0

파일 보기

파일: docvert_libreoffice.py 프로젝트: gordonbanderson/docvert

        try:
            document.refresh()
        except AttributeError:
            pass
        output_stream = output_stream_wrapper()
        try:
            document.storeToURL(
                'private:stream',
                self._to_properties(OutputStream=output_stream,
                                    FilterName=format))
        except Exception, e:  #ignore any error, verify the output before complaining
            pass
        finally:
            document.close(True)
        if format == LIBREOFFICE_OPEN_DOCUMENT or format == LIBREOFFICE_PDF:
            doc_type = document_type.detect_document_type(output_stream.data)
            output_stream.data.seek(0)
            if format == LIBREOFFICE_OPEN_DOCUMENT and doc_type != document_type.types.oasis_open_document:
                raise docvert_exception.converter_unable_to_generate_open_document(
                    "Unable to generate OpenDocument, was detected as %s. First 2 bytes = %s"
                    % (doc_type, output_stream.data.read(2)))
            elif format == LIBREOFFICE_PDF and doc_type != document_type.types.pdf:
                raise docvert_exception.converter_unable_to_generate_pdf(
                    "Unable to generate PDF, was detected as %s. First 4 bytes = %s"
                    % (doc_type, output_stream.data.read(4)))
        return output_stream.data

    def _to_properties(self, **args):
        props = []
        for key in args:
            prop = PropertyValue()