def convert(self, cache_key=None):
    """Convert the document to HTML.

    Returns the main document content as string and a cache_key for
    quick later retrieval.

    Additional documents (images, etc.) which are result of the
    conversion are placed in the `tmpdir` of this `Document`.

    If `cache_key` is given (and a `cache_dir` set before) we will
    lookup the cache before performing any real conversion.

    Raises `IOError` if conversion fails.
    """
    name = self.name()
    src_path = os.path.join(self.tmpdir, name)

    # Lookup cached doc by cache key (fast)
    resultpath = self.client.get_cached(cache_key)
    if resultpath is None:
        # Lookup cached doc by source (expensive)
        resultpath, cache_key = self.client.get_cached_by_source(
            src_path, OPTIONS_HTML)

    if resultpath is not None:
        # Cache hit (by key or by source): continue with a copy of the
        # cached result instead of the cached file itself.
        newdir = copy_to_secure_location(resultpath)
        resultpath = os.path.join(newdir, os.path.basename(resultpath))
    else:
        # Convert to HTML, new doc will be in resultpath
        resultpath, cache_key, metadata = self.client.convert(
            src_path, OPTIONS_HTML)
        if metadata['error']:
            descr = metadata.get('error-descr', 'Descr. not avail.')
            raise IOError('Could not convert: %s [%s]' % (name, descr))
        newdir = os.path.dirname(resultpath)

    # Close the result file deterministically instead of relying on
    # garbage collection.
    with open(resultpath, 'r') as fd:
        html = fd.read()

    # The old working directory is replaced by the one holding the result.
    self.cleanDir(self.tmpdir)
    self.tmpdir = newdir
    return html, cache_key
def convert_doc(src_doc, options, cache_dir):
    """Convert `src_doc` according to the other parameters.

    `src_doc` is the path to the source document. `options` is a dict
    of options for processing, passed to the processors.

    `cache_dir` may be ``None`` in which no caching is requested
    during processing.

    Generates a converted representation of `src_doc` by calling
    :class:`ulif.openoffice.processor.MetaProcessor` with `options` as
    parameters.

    Afterwards the conversion result is stored in cache (if
    allowed/possible) for speedup of upcoming requests.

    Returns a triple:

      ``(<PATH>, <CACHE_KEY>, <METADATA>)``

    where ``<PATH>`` is the path to the resulting document,
    ``<CACHE_KEY>`` an identifier (string) to retrieve a generated doc
    from cache on future requests, and ``<METADATA>`` is a dict of values
    returned during request (and set by the document processors,
    notably setting the `error` keyword).

    If errors happen or caching is disabled, ``<CACHE_KEY>`` is
    ``None``.
    """
    result_path = None
    cache_key = None
    repr_key = get_marker(options)  # Create unique marker out of options
    metadata = dict(error=False)

    # Generate result
    input_copy_dir = copy_to_secure_location(os.path.abspath(src_doc))
    input_copy = os.path.join(input_copy_dir, os.path.basename(src_doc))
    try:
        proc = MetaProcessor(options=options)  # Removes original doc
        result_path, metadata = proc.process(input_copy)
    except Exception:
        # Do not leave the copied input behind when processing fails.
        shutil.rmtree(input_copy_dir)
        # A bare ``raise`` is valid on Python 2 and 3 and keeps the
        # original traceback.
        raise
def process(self, path, metadata):
    """Clean up the HTML document at `path`.

    Documents whose file extension is not listed in
    ``self.supported_extensions`` are returned untouched.

    Otherwise the document is copied via `copy_to_secure_location`,
    the original is removed via `remove_file_dir`, and the copy is
    rewritten with the output of `cleanup_html`. Image files are then
    renamed according to the name map `cleanup_html` returned.

    Returns a tuple ``(result_path, metadata)``; `metadata` is passed
    through unchanged.
    """
    ext = os.path.splitext(path)[1]
    if ext not in self.supported_extensions:
        return path, metadata

    basename = os.path.basename(path)
    src_path = os.path.join(copy_to_secure_location(path), basename)
    src_dir = os.path.dirname(src_path)
    remove_file_dir(path)

    # Read inside a context manager so the handle is closed before the
    # same file is reopened for writing below.
    with codecs.open(src_path, 'r', 'utf-8') as fd:
        html = fd.read()

    new_html, img_name_map = cleanup_html(
        html, basename,
        fix_head_nums=self.options['html_cleaner_fix_heading_numbers'],
        fix_img_links=self.options['html_cleaner_fix_image_links'],
        fix_sdfields=self.options['html_cleaner_fix_sd_fields'],
    )
    with codecs.open(src_path, 'wb', 'utf-8') as fd:
        fd.write(new_html)

    # Rename images
    self.rename_img_files(src_dir, img_name_map)
    return src_path, metadata
def process(self, path, metadata):
    """Run the HTML cleaner over the document at `path`.

    If the extension of `path` is not in ``self.supported_extensions``
    nothing is done and ``(path, metadata)`` is returned as-is.

    For supported documents a copy is created with
    `copy_to_secure_location` and the original is removed with
    `remove_file_dir`. The copy is replaced by the result of
    `cleanup_html`, and ``self.rename_img_files`` is called with the
    image name map `cleanup_html` returned.

    Returns ``(result_path, metadata)`` where `result_path` is the path
    of the processed copy and `metadata` is returned unmodified.
    """
    ext = os.path.splitext(path)[1]
    if ext not in self.supported_extensions:
        return path, metadata

    basename = os.path.basename(path)
    src_path = os.path.join(
        copy_to_secure_location(path), basename)
    src_dir = os.path.dirname(src_path)
    remove_file_dir(path)

    # The source is read and closed first; the file is rewritten in
    # place right afterwards.
    with codecs.open(src_path, 'r', 'utf-8') as src_fd:
        raw_html = src_fd.read()

    new_html, img_name_map = cleanup_html(
        raw_html, basename,
        fix_head_nums=self.options['html_cleaner_fix_heading_numbers'],
        fix_img_links=self.options['html_cleaner_fix_image_links'],
        fix_sdfields=self.options['html_cleaner_fix_sd_fields'],
    )
    with codecs.open(src_path, 'wb', 'utf-8') as fd:
        fd.write(new_html)

    # Rename images
    self.rename_img_files(src_dir, img_name_map)
    return src_path, metadata
def process(self, path, metadata):
    """Tidy the HTML document at `path` with the external ``tidy`` tool.

    Documents whose extension is not in ``self.supported_extensions``
    are returned untouched.

    Otherwise the document is copied via `copy_to_secure_location`, the
    original is removed via `remove_file_dir`, the copy is passed
    through `rename_sdfield_tags` and then modified in place by
    ``tidy``. The ``tidy-errors`` file written by ``tidy`` is deleted
    afterwards.

    Returns ``(result_path, metadata)``; `metadata` is passed through
    unchanged.

    Raises `OSError` if ``tidy`` cannot be executed or its error file
    cannot be removed.
    """
    import subprocess

    ext = os.path.splitext(path)[1]
    if ext not in self.supported_extensions:
        return path, metadata

    basename = os.path.basename(path)
    src_path = os.path.join(copy_to_secure_location(path), basename)
    src_dir = os.path.dirname(src_path)
    remove_file_dir(path)

    # Remove <SDFIELD> tags if any
    with open(src_path, 'rb') as fd:
        cleaned_html = rename_sdfield_tags(fd.read().decode('utf-8'))
    with open(src_path, 'wb') as fd:
        fd.write(cleaned_html.encode('utf-8'))

    error_file = os.path.join(src_dir, 'tidy-errors')
    # Pass an argument list rather than a shell string: file names that
    # contain spaces or shell metacharacters then reach tidy verbatim.
    # The exit status is ignored, as it was before.
    subprocess.call([
        'tidy', '-asxhtml', '-clean', '-indent', '-modify', '-utf8',
        '-f', error_file, src_path])
    os.unlink(error_file)
    return src_path, metadata
def process(self, path, metadata):
    """Run ``tidy`` over the HTML document at `path`.

    If the extension of `path` is not in ``self.supported_extensions``
    the input is returned as-is.

    For supported documents a copy is created with
    `copy_to_secure_location` and the original is removed with
    `remove_file_dir`. The copy is rewritten with the output of
    `rename_sdfield_tags` and then converted in place by the external
    ``tidy`` program, whose messages go to a ``tidy-errors`` file that
    is removed again before returning.

    Returns ``(result_path, metadata)`` with `metadata` unmodified.

    Raises `OSError` if ``tidy`` cannot be started or the error file
    cannot be deleted.
    """
    import subprocess

    ext = os.path.splitext(path)[1]
    if ext not in self.supported_extensions:
        return path, metadata

    basename = os.path.basename(path)
    src_path = os.path.join(
        copy_to_secure_location(path), basename)
    src_dir = os.path.dirname(src_path)
    remove_file_dir(path)

    # Remove <SDFIELD> tags if any
    with open(src_path, 'rb') as src_fd:
        raw_html = src_fd.read().decode('utf-8')
    cleaned_html = rename_sdfield_tags(raw_html)
    with open(src_path, 'wb') as fd:
        fd.write(cleaned_html.encode('utf-8'))

    error_file = os.path.join(src_dir, 'tidy-errors')
    # No shell is involved, so paths are never split on whitespace or
    # interpreted as shell syntax. The exit status stays ignored.
    cmd = ['tidy', '-asxhtml', '-clean', '-indent', '-modify', '-utf8',
           '-f', error_file, src_path]
    subprocess.call(cmd)
    os.unlink(error_file)
    return src_path, metadata
def process(self, path, metadata):
    """Move the CSS of the HTML document at `path` to a separate file.

    Documents whose extension is not in ``self.supported_extensions``
    are returned untouched.

    Otherwise the document is copied via `copy_to_secure_location` and
    the original is removed via `remove_file_dir`. `extract_css`
    yields new HTML and the CSS, which is passed through `cleanup_css`.
    If the resulting CSS is not ``None`` it is written next to the
    document with a ``.css`` extension. The document itself is replaced
    by the new HTML.

    Returns ``(result_path, metadata)``; `metadata` is passed through
    unchanged.
    """
    ext = os.path.splitext(path)[1]
    if ext not in self.supported_extensions:
        return path, metadata

    basename = os.path.basename(path)
    src_path = os.path.join(
        copy_to_secure_location(path), basename)
    remove_file_dir(path)

    # Close the source before it is overwritten further down.
    with open(src_path, 'rb') as fd:
        html = fd.read().decode('utf-8')

    new_html, css = extract_css(
        html, basename,
        prettify_html=self.options['css_cleaner_prettify_html'])
    # The error list reported by cleanup_css is not used here.
    css, _errors = cleanup_css(
        css, minified=self.options['css_cleaner_minified'])

    css_file = os.path.splitext(src_path)[0] + '.css'
    if css is not None:
        with open(css_file, 'wb') as fd:
            fd.write(css.encode('utf-8'))
    with open(src_path, 'wb') as fd:
        fd.write(new_html.encode('utf-8'))
    return src_path, metadata
def process(self, path, metadata):
    """Extract and clean the CSS of the HTML document at `path`.

    If the extension of `path` is not in ``self.supported_extensions``
    the input is returned as-is.

    For supported documents a copy is created with
    `copy_to_secure_location` and the original is removed with
    `remove_file_dir`. The HTML is split by `extract_css`, the CSS is
    processed by `cleanup_css` and, unless it is ``None``, stored in a
    ``.css`` file named after the document. The document is then
    overwritten with the HTML returned by `extract_css`.

    Returns ``(result_path, metadata)`` with `metadata` unmodified.
    """
    ext = os.path.splitext(path)[1]
    if ext not in self.supported_extensions:
        return path, metadata

    basename = os.path.basename(path)
    src_path = os.path.join(copy_to_secure_location(path), basename)
    remove_file_dir(path)

    # Use a context manager so the read handle is released before the
    # file is rewritten.
    with open(src_path, 'rb') as src_fd:
        raw_html = src_fd.read().decode('utf-8')

    new_html, css = extract_css(
        raw_html, basename,
        prettify_html=self.options['css_cleaner_prettify_html'])
    # Only the cleaned CSS is needed; reported errors are ignored.
    css, _errors = cleanup_css(
        css, minified=self.options['css_cleaner_minified'])

    css_file = os.path.splitext(src_path)[0] + '.css'
    if css is not None:
        with open(css_file, 'wb') as fd:
            fd.write(css.encode('utf-8'))
    with open(src_path, 'wb') as fd:
        fd.write(new_html.encode('utf-8'))
    return src_path, metadata
def process(self, path, metadata):
    """Do PSJ-specific adaptions of generated HTML input.

    `path` gives any (beforehand) generated HTML document. The path
    might be located in a directory with additional files (images,
    etc.) that could also be processed.

    `metadata` is a dictionary of metadata concerning the conversion
    process. It contains at least a key ``error`` with a boolean
    value (should always be `False`, otherwise the document
    conversion failed), and a key ``error-descr`` which contains some
    error message in case of failures.

    The ``error`` and ``error-descr`` should be set when unresolvable
    processing problems occur.

    Returns a tuple (``result_path``, ``metadata``) with
    ``result_path`` containing the path to the modified document and
    ``metadata`` containing the updated ``metadata`` dictionary passed
    in.
    """
    ext = os.path.splitext(path)[1]
    if ext not in self.supported_extensions:
        return path, metadata

    basename = os.path.basename(path)
    src_path = os.path.join(
        copy_to_secure_location(path), basename)
    remove_file_dir(path)

    # All files are handled in context managers so that handles are
    # closed (and written data flushed) before the next step runs.
    with open(src_path, 'r') as fd:
        html = self.fix_html(fd.read())
    with open(src_path, 'w') as fd:
        fd.write(html.encode('utf-8'))

    css = self.get_css(os.path.dirname(src_path))
    css = self.fix_css(css)
    # The fixed stylesheet is stored as 'psj.css' next to the document.
    css_path = os.path.join(os.path.dirname(src_path), 'psj.css')
    with open(css_path, 'w') as fd:
        fd.write(css)
    return src_path, metadata
def process(self, path, metadata):
    """Convert the document at `path` via the `convert` helper.

    The input is copied with `copy_to_secure_location` and the original
    (its directory, if `path` is a file) is removed. The output format
    is taken from ``self.options['oocp_output_format']``; the connection
    URL is built from ``oocp_hostname`` and ``oocp_port``.

    The status returned by `convert` is stored as
    ``metadata['oocp_status']``. On a non-zero status ``error`` and
    ``error-descr`` are set in `metadata`, the working copy is removed
    and ``(None, metadata)`` is returned.

    Otherwise returns ``(result_path, metadata)`` where `result_path` is
    the copied source path with the output extension (``html`` for
    ``xhtml``). The copied input file is deleted if its name differs
    from the result's.
    """
    basename = os.path.basename(path)
    src = os.path.join(copy_to_secure_location(path), basename)

    # Drop the original input: its whole directory when `path` is a file.
    origin = os.path.dirname(path) if os.path.isfile(path) else path
    shutil.rmtree(origin)

    extension = self.options['oocp_output_format']
    filter_name = self.formats[extension]
    host_and_port = (
        self.options['oocp_hostname'], self.options['oocp_port'])
    url = ('socket,host=%s,port=%d;urp;StarOffice.ComponentContext'
           % host_and_port)
    filter_props = self._get_filter_props()

    # The path reported by convert() is not used; the result path is
    # derived from `src` below.
    status, result_path = convert(
        url=url,
        out_format=filter_name,
        filter_props=filter_props,
        path=src,
        out_dir=os.path.dirname(src),
    )
    metadata['oocp_status'] = status

    if status != 0:
        metadata['error'] = True
        metadata['error-descr'] = 'conversion problem'
        leftover = os.path.dirname(src) if os.path.isfile(src) else src
        shutil.rmtree(leftover)
        return None, metadata

    if extension == 'xhtml':
        extension = 'html'
    stem = os.path.splitext(src)[0]
    result_path = '%s.%s' % (stem, extension)

    # Remove input file if different from output
    if os.path.exists(src) and os.path.basename(result_path) != basename:
        os.unlink(src)
    return result_path, metadata
def process(self, path, metadata):
    """Run the `convert` helper on a copy of the document at `path`.

    A copy of the input is created with `copy_to_secure_location`; the
    original location is deleted afterwards. Output format, host and
    port are read from ``self.options`` (keys ``oocp_output_format``,
    ``oocp_hostname``, ``oocp_port``).

    ``metadata['oocp_status']`` receives the status `convert` returned.
    A non-zero status marks `metadata` as failed (``error``,
    ``error-descr``), removes the copy and yields ``(None, metadata)``.

    On success ``(result_path, metadata)`` is returned; `result_path` is
    the path of the copy with its extension replaced by the output
    format (``xhtml`` is mapped to ``html``). The copied input is
    unlinked when its file name differs from the result's.
    """
    def _containing_dir(location):
        # For a file return its directory, otherwise the location itself.
        if os.path.isfile(location):
            return os.path.dirname(location)
        return location

    basename = os.path.basename(path)
    secure_dir = copy_to_secure_location(path)
    src = os.path.join(secure_dir, basename)
    shutil.rmtree(_containing_dir(path))

    extension = self.options['oocp_output_format']
    filter_name = self.formats[extension]
    url = 'socket,host=%s,port=%d;urp;StarOffice.ComponentContext' % (
        self.options['oocp_hostname'], self.options['oocp_port'])
    filter_props = self._get_filter_props()

    convert_args = dict(
        url=url,
        out_format=filter_name,
        filter_props=filter_props,
        path=src,
        out_dir=os.path.dirname(src),
    )
    # Only the status is evaluated; the returned path is recomputed below.
    status, result_path = convert(**convert_args)
    metadata['oocp_status'] = status

    failed = status != 0
    if failed:
        metadata['error'] = True
        metadata['error-descr'] = 'conversion problem'
        shutil.rmtree(_containing_dir(src))
        return None, metadata

    if extension == 'xhtml':
        extension = 'html'
    result_path = '%s.%s' % (os.path.splitext(src)[0], extension)

    # Remove input file if different from output
    same_name = os.path.basename(result_path) == basename
    if os.path.exists(src) and not same_name:
        os.unlink(src)
    return result_path, metadata
def test_copy_to_secure_location_path(self):
    # A directory passed to copy_to_secure_location is copied together
    # with the files it contains.
    sample_path = os.path.join(self.workdir, 'sample.txt')
    # Write bytes (the file is opened in binary mode) and close the
    # handle before the directory is copied.
    with open(sample_path, 'wb') as fd:
        fd.write(b"Hi from sample")
    sample_dir = os.path.dirname(sample_path)
    self.resultpath = copy_to_secure_location(sample_dir)
    assert os.path.isfile(os.path.join(self.resultpath, 'sample.txt'))
def test_copy_to_secure_location_path(self, workdir):
    # directories (not only single files) can be copied to a secure
    # location; contained files must show up in the copy
    src_dir = workdir / "src"
    src_dir.join("sample.txt").write("Hey there!")
    copied_dir = copy_to_secure_location(str(src_dir))
    copied_file = os.path.join(copied_dir, 'sample.txt')
    assert os.path.isfile(copied_file)