示例#1
0
 def test_wrong_option_name_format(self):
     """Option names not starting with proper dashes are rejected."""
     malformed_names = (
         # short name must have format '-XXX'
         ('opt-name-wo-dash', ),
         # long name must have format '--XXX'
         ('-myproc-opt1', '-myproc-option1'),
     )
     for names in malformed_names:
         with pytest.raises(ValueError):
             Argument(*names)
示例#2
0
class CSSCleaner(BaseProcessor):
    """A processor for cleaning up CSS parts of HTML code.

    Normal converters leave CSS inside an HTML document. This
    processor first aggregates these style parts and then puts it into
    an external CSS file leaving only a link to that file.

    This processor requires HTML/XHTML input.
    """
    prefix = 'css_cleaner'

    args = [
        Argument(
            '-css-cleaner-min',
            '--css-cleaner-minified',
            type=boolean,
            default=True,
            metavar='YES|NO',
            help='Whether to minify generated CSS (when handling HTML) '
            'Default: yes',
        ),
        Argument(
            '-css-cleaner-prettify',
            '--css-cleaner-prettify-html',
            type=boolean,
            default=False,
            metavar='YES|NO',
            help='Prettify generated HTML (may lead to gaps in '
            'rendered output) Default: no',
        ),
    ]

    #: filename extensions of documents this processor handles
    supported_extensions = ['.html', '.xhtml']

    def process(self, path, metadata):
        """Extract the CSS of the HTML document in `path`.

        Documents with an extension not listed in
        `supported_extensions` are returned unchanged.

        Otherwise the document is copied via
        `copy_to_secure_location`, the original `path` is handed to
        `remove_file_dir`, and the copy is rewritten with the HTML
        returned by `extract_css`. If the cleaned-up CSS is not
        ``None``, it is written to a ``.css`` file next to the copy.

        Returns a tuple ``(<path>, <metadata>)`` with `<path>` being
        the path of the rewritten copy (or the unchanged input path
        for unsupported documents).
        """
        ext = os.path.splitext(path)[1]
        if ext not in self.supported_extensions:
            return path, metadata
        basename = os.path.basename(path)
        src_path = os.path.join(copy_to_secure_location(path), basename)
        remove_file_dir(path)

        # Use a context manager so the file handle is closed right
        # after reading and not left to the garbage collector.
        with open(src_path, 'rb') as fd:
            html = fd.read().decode('utf-8')

        new_html, css = extract_css(
            html,
            basename,
            prettify_html=self.options['css_cleaner_prettify_html'])
        # XXX: errors found during CSS cleanup are not reported yet.
        css, _errors = cleanup_css(
            css, minified=self.options['css_cleaner_minified'])

        css_file = os.path.splitext(src_path)[0] + '.css'
        if css is not None:
            with open(css_file, 'wb') as fd:
                fd.write(css.encode('utf-8'))
        with open(src_path, 'wb') as fd:
            fd.write(new_html.encode('utf-8'))

        return src_path, metadata
示例#3
0
 def test_default_string(self):
     """Defaults of various types can be retrieved as strings."""
     cases = (
         ({'default': 1}, '1'),  # ints
         ({'default': 'foo'}, 'foo'),  # strings
         ({'default': True}, 'yes'),  # bools
         ({'default': ['a', 'b']}, 'a, b'),  # lists
         ({'default': ('a', 'b')}, 'a, b'),  # tuples
         ({}, 'None'),  # no default given at all
     )
     for keywords, expected in cases:
         argument = Argument('-my-opt1', None, **keywords)
         assert argument.default_string == expected
示例#4
0
 def test_regular(self):
     """Names and extra keywords passed in are stored on the argument."""
     short, long_ = '-myproc-opt1', '--myproc-option1'
     argument = Argument(short, long_, choice=[1, 2, 3])
     assert argument.short_name == short
     assert argument.long_name == long_
     assert argument.keywords['choice'] == [1, 2, 3]
示例#5
0
class HTMLCleaner(BaseProcessor):
    """A processor for cleaning up HTML produced by OO.org.

    Fixes minor issues with HTML code produced by OO.org.

    This processor expects XHTML input.
    """
    prefix = 'html_cleaner'

    args = [
        Argument(
            '-html-cleaner-fix-head-nums',
            '--html-cleaner-fix-heading-numbers',
            type=boolean,
            default=True,
            metavar='YES|NO',
            help='Whether to fix heading numbers in generated HTML '
            'Default: yes',
        ),
        Argument(
            '-html-cleaner-fix-img-links',
            '--html-cleaner-fix-image-links',
            type=boolean,
            default=True,
            metavar='YES|NO',
            # This text used to be a verbatim copy of the heading
            # numbers help and described the wrong option.
            help='Whether to fix image links in generated HTML '
            'Default: yes',
        ),
        Argument(
            '-html-cleaner-fix-sd-fields',
            '--html-cleaner-fix-sd-fields',
            type=boolean,
            default=True,
            metavar='YES|NO',
            help='Whether to fix SD fields in HTML generated by '
            'LibreOffice. Default: yes',
        ),
    ]

    #: filename extensions of documents this processor handles
    supported_extensions = ['.html', '.xhtml']

    def process(self, path, metadata):
        """Clean up the HTML document in `path`.

        Documents with an extension not listed in
        `supported_extensions` are returned unchanged.

        Otherwise the document is copied via
        `copy_to_secure_location`, the original `path` is handed to
        `remove_file_dir`, and the copy is overwritten with the HTML
        returned by `cleanup_html`. Image files in the directory of
        the copy are renamed afterwards according to the name mapping
        `cleanup_html` returned.

        Returns a tuple ``(<path>, <metadata>)`` with `<path>` being
        the path of the cleaned copy (or the unchanged input path for
        unsupported documents).
        """
        ext = os.path.splitext(path)[1]
        if ext not in self.supported_extensions:
            return path, metadata
        basename = os.path.basename(path)
        src_path = os.path.join(copy_to_secure_location(path), basename)
        src_dir = os.path.dirname(src_path)
        remove_file_dir(path)
        # Use a context manager so the file handle is closed right
        # after reading and not left to the garbage collector.
        with codecs.open(src_path, 'r', 'utf-8') as fd:
            html = fd.read()
        new_html, img_name_map = cleanup_html(
            html,
            basename,
            fix_head_nums=self.options['html_cleaner_fix_heading_numbers'],
            fix_img_links=self.options['html_cleaner_fix_image_links'],
            fix_sdfields=self.options['html_cleaner_fix_sd_fields'],
        )
        with codecs.open(src_path, 'wb', 'utf-8') as fd:
            fd.write(new_html)
        # Rename images
        self.rename_img_files(src_dir, img_name_map)
        return src_path, metadata

    def rename_img_files(self, src_dir, img_name_map):
        """Rename image files in `src_dir` according to `img_name_map`.

        `img_name_map` maps old filenames to new ones, both relative
        to `src_dir`. Entries are silently skipped if the old file
        does not exist or if something already exists under the new
        name, so no existing file is overwritten.
        """
        for old_img, new_img in img_name_map.items():
            old_path = os.path.join(src_dir, old_img)
            new_path = os.path.join(src_dir, new_img)
            if not os.path.isfile(old_path):
                # XXX: Update error messages
                continue
            if os.path.exists(new_path):
                # XXX: Update error messages
                continue
            shutil.move(old_path, new_path)
示例#6
0
class OOConvProcessor(BaseProcessor):
    """Processor converting office documents into other formats.

    The conversion itself is delegated to `convert`, which gets a
    connection URL built from the hostname and port options.

    XXX: far more options could be supported. See

         http://wiki.services.openoffice.org/wiki/API/Tutorials/
                PDF_export#How_to_use_it_from_OOo_Basic

         for a list of the PDF export options alone.
    """
    prefix = 'oocp'

    #: mapping: extension <-> format (as accepted by unoconv)
    formats = OUTPUT_FORMATS

    options = {}

    args = [
        Argument('-oocp-out-fmt',
                 '--oocp-output-format',
                 choices=OUTPUT_FORMATS.keys(),
                 default='html',
                 help=('Output format to create via LibreOffice.'
                       'Pick from: %s' % ', '.join(OUTPUT_FORMATS.keys())),
                 metavar='FORMAT'),
        Argument('-oocp-pdf-version',
                 '--oocp-pdf-version',
                 type=boolean,
                 default=False,
                 metavar='YES|NO',
                 help='Create versioned PDF (aka PDF/A)? Default: no'),
        Argument('-oocp-pdf-tagged',
                 '--oocp-pdf-tagged',
                 type=boolean,
                 default=False,
                 metavar='YES|NO',
                 help='Create tagged PDF document? Default: no'),
        Argument('-oocp-host',
                 '--oocp-hostname',
                 default='localhost',
                 help='Host to contact for LibreOffice document '
                 'conversion. Default: "localhost"'),
        Argument('-oocp-port',
                 '--oocp-port',
                 type=int,
                 default=2002,
                 help='Port of host to contact for LibreOffice document '
                 'conversion. Default: 2002'),
    ]

    def _get_filter_props(self):
        """Return export filter properties as list of name/value tuples.

        Properties are set for PDF output only; any other output
        format yields an empty list.
        """
        opts = self.options
        if opts['oocp_output_format'] != 'pdf':
            return []
        return [
            ("SelectPdfVersion", '1' if opts['oocp_pdf_version'] else '0'),
            ("UseTaggedPDF", '1' if opts['oocp_pdf_tagged'] else '0'),
        ]

    def process(self, path, metadata):
        """Convert the document in `path` to the requested output format.

        The document is copied via `copy_to_secure_location` first and
        the original is removed (a file together with its directory).

        The status returned by `convert` is stored in `metadata` as
        ``oocp_status``. On a non-zero status the error entries of
        `metadata` are set, the copy is removed and ``None`` is
        returned as path. Otherwise the path of the conversion result
        is returned.
        """
        name = os.path.basename(path)
        src = os.path.join(copy_to_secure_location(path), name)
        # Get rid of the original; files go away with their directory.
        shutil.rmtree(os.path.dirname(path) if os.path.isfile(path) else path)

        extension = self.options['oocp_output_format']
        filter_name = self.formats[extension]
        url = 'socket,host=%s,port=%d;urp;StarOffice.ComponentContext' % (
            self.options['oocp_hostname'], self.options['oocp_port'])
        filter_props = self._get_filter_props()

        # The path reported by `convert` is not used; the result path
        # is derived from the source path below.
        status, _ = convert(
            url=url,
            out_format=filter_name,
            filter_props=filter_props,
            path=src,
            out_dir=os.path.dirname(src),
        )
        metadata['oocp_status'] = status
        if status != 0:
            metadata['error'] = True
            metadata['error-descr'] = 'conversion problem'
            shutil.rmtree(
                os.path.dirname(src) if os.path.isfile(src) else src)
            return None, metadata

        # XHTML results carry a plain ``.html`` filename extension.
        suffix = 'html' if extension == 'xhtml' else extension
        result_path = '%s.%s' % (os.path.splitext(src)[0], suffix)

        # Remove input file if different from output
        if os.path.exists(src) and os.path.basename(result_path) != name:
            os.unlink(src)
        return result_path, metadata
示例#7
0
class MetaProcessor(BaseProcessor):
    """The meta processor handles general workflow.

    When getting certain options, it constructs a pipeline of document
    processors.

    The :class:`MetaProcessor` is a kind of processor dispatcher that
    finds, setups and calls all requested processors in the requested
    order.

    """
    #: the meta processor is named 'meta'
    prefix = 'meta'

    #: We support a ``-meta-procord`` option which stands for
    #: ``processororder``. The current default order is:
    #: ``'unzip,oocp,zip'`` which means: maybe unzip the input, then
    #: convert it into HTML and afterwards zip the results.
    args = [
        Argument(
            '-meta-procord',
            '--meta-processor-order',
            default=string_to_stringtuple(DEFAULT_PROCORDER),
            type=processor_order,
            help='Comma-separated list of processors to run. '
            'Default: "%s"' % DEFAULT_PROCORDER,
            metavar='PROC_LIST',
        ),
    ]

    @property
    def avail_procs(self):
        """Processors registered under ``ulif.openoffice.processors``."""
        return get_entry_points('ulif.openoffice.processors')

    def __init__(self, options=None):
        """Set up the meta processor.

        `options` may be an `Options` instance or a dict of strings,
        which is turned into an `Options` instance. Omitting it is the
        same as passing an empty dict.
        """
        # Imported locally, as in the original code (presumably to
        # avoid a circular import -- TODO confirm).
        from ulif.openoffice.options import Options
        if options is None:
            # ``None`` instead of a shared mutable ``{}`` default.
            options = {}
        if not isinstance(options, Options):
            options = Options(string_dict=options)
        self.all_options = options
        self.options = options
        self.metadata = {}

    def process(self, input=None, metadata=None):
        """Run all processors defined in options.

        If all processors run successful, the output of the last along
        with (maybe modified) metadata is returned.

        Each processor is fed with the `metadata` dict and an `input`
        (normally a filepath). Feeding a processor means to call its
        `process` method.

        If a processor sets the ``error`` entry of `metadata` to
        ``True`` this indicates some problem and the whole process is
        aborted returning ``None`` as output and the `metadata`, maybe
        containing some smart hints about the reasons.

        If all processors work correctly, the output of the last
        processor is returned along with the last `metadata`.

        The set and order of processors called depends on the
        ``procord`` option passed in. If this option is set to some
        value like ``oocp,oocp`` then the ``oocp`` processor (which is
        the :class:`OOConvProcessor`, registered under ``oocp`` in
        `setup.py`) is called two times.

        If `metadata` is omitted, ``{'error': False}`` is used. A
        `metadata` dict passed in is copied and not modified itself.

        .. note:: after each processing, the (then old) input is
                  removed.
        """
        # ``None`` instead of a shared mutable dict as default.
        if metadata is None:
            metadata = {'error': False}
        else:
            metadata = metadata.copy()
        pipeline = self._build_pipeline()
        output = None

        for processor in pipeline:
            proc_instance = processor(self.all_options)
            output, metadata = proc_instance.process(input, metadata)
            # A missing ``error`` entry means "no error" instead of
            # raising a KeyError for metadata lacking that key.
            if metadata.get('error') is True:
                metadata = self._handle_error(processor, input, output,
                                              metadata)
                return None, metadata
            if input != output:
                remove_file_dir(input)
            input = output
        return input, metadata

    def _handle_error(self, proc, input, output, metadata):
        """Clean up after processor `proc` reported an error.

        Sets a generic ``error-descr`` in `metadata` unless one is
        already set, hands both `input` and `output` to
        `remove_file_dir`, and returns the `metadata`.
        """
        metadata.setdefault(
            'error-descr', 'problem while processing %s' % proc.prefix)
        remove_file_dir(input)
        remove_file_dir(output)
        return metadata

    def _build_pipeline(self):
        """Build a pipeline of processors according to options.

        Returns a tuple of the available processors named in the
        ``meta_processor_order`` option, in that order.
        """
        procs = self.avail_procs
        return tuple(
            procs[proc_name]
            for proc_name in self.options['meta_processor_order'])