Exemplo n.º 1
0
    def __init__(self,
            file=None,        # Document file name
            mimetype='',      # Mimetype string
            content='',       # Content data (the text)
            manifest='',      # Lists the contents of the ODF file
            meta='',          # Metadata
            styles='',        # Formatting data
            settings='',      # Application-specific data
            additional=None,  # Additional bundled files (e.g. images)
            file_dates=None   # File dates for all files and directories
            ):
        """Build the document from its individual components.

        content, manifest, meta, settings and styles are passed to the
        Content, Manifest, Meta, Settings and Styles constructors; file,
        mimetype, additional and file_dates are stored unchanged.

        additional and file_dates default to None, which means "a new, empty
        dictionary for this instance".  A dictionary passed in explicitly is
        stored as-is (not copied).

        """
        # Pass XML components to corresponding constructors
        self.content = Content(content)
        self.manifest = Manifest(manifest)
        self.meta = Meta(meta)
        self.settings = Settings(settings)
        self.styles = Styles(styles)

        # Remaining components don't need any conversion
        self.file = file
        self.mimetype = mimetype

        # A literal {} default would be created once at definition time and
        # then shared (and mutated) by every instance relying on the default.
        self.additional = {} if additional is None else additional
        self.file_dates = {} if file_dates is None else file_dates
Exemplo n.º 2
0
class Document(object):
    """ The ODF document class -- object model and associated methods.

    Contains the document in memory and is used as the intermediate step for
    conversions and transformations.

    This implementation uses the ElementTree module to create and navigate the
    object. This is built into Python 2.5 and available separately as a standalone
    module.

    """

    def __init__(self,
            file=None,        # Document file name
            mimetype='',      # Mimetype string
            content='',       # Content data (the text)
            manifest='',      # Lists the contents of the ODF file
            meta='',          # Metadata
            styles='',        # Formatting data
            settings='',      # Application-specific data
            additional=None,  # Additional bundled files (e.g. images)
            file_dates=None   # File dates for all files and directories
            ):
        """Build the document from its individual components.

        content, manifest, meta, settings and styles are passed to the
        Content, Manifest, Meta, Settings and Styles constructors; file,
        mimetype, additional and file_dates are stored unchanged.

        additional and file_dates default to None, which means "a new, empty
        dictionary for this instance".  A dictionary passed in explicitly is
        stored as-is (not copied).

        """
        # Pass XML components to corresponding constructors
        self.content = Content(content)
        self.manifest = Manifest(manifest)
        self.meta = Meta(meta)
        self.settings = Settings(settings)
        self.styles = Styles(styles)

        # Remaining components don't need any conversion
        self.file = file
        self.mimetype = mimetype

        # A literal {} default would be created once at definition time and
        # then shared (and mutated) by every instance relying on the default.
        self.additional = {} if additional is None else additional
        self.file_dates = {} if file_dates is None else file_dates

    # Get non-XML components from the document

    def get_embedded(self, filter=None, ignore_case=False):
        r"""Return a dictionary of the objects embedded in the document.

        Currently only entries of self.additional whose name starts with
        "Pictures/" are considered; the returned dictionary maps each name,
        with that prefix removed, to its data.

        filter and ignore_case are handed to get_search_for_filter(); an
        entry is kept when the resulting callable returns a true value for
        its stripped name.  The filter currently supports UNIX glob patterns
        like "*a[bc]?.png" and/or correct regular expressions like
        ".*a[bc].\.png$".

        """
        # TODO: support other embedded objects
        prefix = 'Pictures/'
        search = get_search_for_filter(filter, ignore_case)
        return dict((filename[len(prefix):], data)
                    for filename, data in self.additional.items()
                    if filename.startswith(prefix)
                    and search(filename[len(prefix):]))

    def get_extension(self):
        """Return ODF extension for given mimetype."""
        # Delegates to the module-level get_extension() function, not to
        # this method (the bare name resolves in the module namespace).
        return get_extension(self.mimetype)

    # Convert the document to other formats

    def tostring(self, key="content", encoding="utf-8"):
        """Get the XML representation of the given component.

        key names the attribute of this document to serialize.  A plain
        string component is returned encoded with *encoding*; any other
        component is asked to serialize itself through its own
        tostring(encoding=...) method.

        """
        comp = getattr(self, key)
        if isinstance(comp, str):
            return comp.encode(encoding)
        else:
            return comp.tostring(encoding=encoding)

    def totext(self, skip_blank_lines=True):
        """Return the content of the document as a plain-text Unicode string.

        Included here as well as in self.content to resemble to_html's usage.

        """
        # NOTE(review): skip_blank_lines is accepted but not forwarded to
        # self.content.totext() -- confirm whether Content.totext() takes it.
        return self.content.totext()

    def tohtml(self, title="", encoding="utf-8"):
        """Return an HTML representation of the document.

        title becomes the text of the <title> element.  encoding is passed
        to ElementTree's tostring(); when that yields an encoded byte string
        the doctype line is encoded the same way, so the result is a byte
        string, otherwise a text string is returned.

        """
        # TODO:
        # First, convert to ET operations
        # Then,
        # - Scrape up meta tags and add to headnode
        #     '<meta http-equiv="content-type" content="text/html; charset=UTF-8">'
        #     '<meta type="Generator" content="python-odftools" />'
        # - Title for the page, if applicable
        # - Convert self.styles to CSS and add to headnode as a <style type="text/css"> element
        #     - see cssutils at the Python cheeseshop
        # - Fix the unit test
        #
        # ENH:
        # - Support encodings other than UTF-8, and maybe Unicode
        # - Allow named elements
        # - A more natural way of doing the doctype declaration, if possible

        attrs_odf2html = {"style-name": "class"}
        tags_odf2html = {
                "a": "a",
                "body": "body",
                "p": "p",
                "span": "span",
                "table": "table",
                "h": "h1",
                "table-row": "tr",
                "table-cell": "td",
                "image": "img",
                "list": "ol",
                "list-item": "li" }

        htmldoc = ET.Element("html")
        headnode = ET.SubElement(htmldoc, "head")
        titlenode = ET.SubElement(headnode, "title")
        titlenode.text = title
        # ENH: add meta etc. nodes to the head as needed

        docbody = self.content.root.find("office:body")
        # Compare with None explicitly: the truth value of an ElementTree
        # element reflects whether it has children, not whether find()
        # located it.
        if docbody is not None:
            bodynode = translate_nodes(docbody, tags_odf2html, attrs_odf2html)
            # The translated body used to be discarded; attach it so it
            # actually shows up in the output.
            # NOTE(review): assumes translate_nodes() returns the translated
            # element -- anything that is not an element is left out.
            if ET.iselement(bodynode):
                htmldoc.append(bodynode)
        else:
            ET.SubElement(htmldoc, "body")

        doctypestr = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">\n'
        htmlstr = ET.tostring(htmldoc, encoding=encoding)
        if isinstance(htmlstr, str):
            return "\n".join((doctypestr, htmlstr))
        # tostring() produced encoded bytes, which cannot be joined with a
        # text doctype; encode the doctype and the separator the same way.
        return (doctypestr + "\n").encode(encoding) + htmlstr

    # Operations

    def replace(self, search, replace, key="content"):
        """Call replace(search, replace) on the component named by key.

        Returns whatever that component's replace() method returns.

        """
        return getattr(self, key).replace(search, replace)