Exemplo n.º 1
0
    def from_file(cls, xmlfile, *args, **kwargs):
        """Build an instance from the rdf:RDF section of an XML source.

        The source is streamed with ``etree.iterparse`` only until the
        rdf:RDF element has been closed; that element is then handed to
        ``cls.from_element`` together with the extra ``*args``/``**kwargs``.

        xmlfile: whatever ``etree.iterparse`` accepts as its source.

        Raises NoDublinCore when no rdf:RDF element is encountered, and
        ParseError when the parser raises XMLSyntaxError or ExpatError.
        """
        rdf_tag = RDFNS('RDF')
        rdf_element = None
        try:
            events = etree.iterparse(xmlfile, ['start', 'end'])

            # look for the opening of the RDF section
            for (event, element) in events:
                if event == 'start' and element.tag == rdf_tag:
                    rdf_element = element
                    break

            if rdf_element is None:
                # Adjacent literals instead of a backslash continuation, so
                # the source indentation does not leak into the message.
                raise NoDublinCore(
                    "DublinCore section not found. "
                    "Check if there are rdf:RDF and rdf:Description tags.")

            # Keep consuming the same event stream 'till the end of the RDF
            # section, so the element is fully populated before it is used.
            for (event, element) in events:
                if event == 'end' and element.tag == rdf_tag:
                    break

            # if there is no end, the parser is expected to complain with
            # one of the errors handled below

            # extract data from the element and make the info
            return cls.from_element(rdf_element, *args, **kwargs)
        except (XMLSyntaxError, ExpatError) as e:
            raise ParseError(e)
Exemplo n.º 2
0
    def from_file(cls, xmlfile, parse_dublincore=True, image_store=None):
        """Parse an XML source and return an instance with frame info loaded.

        xmlfile: a path string, or an object with a ``read()`` method.
        parse_dublincore: passed through to the class constructor.
        image_store: passed through to the class constructor; when it is
            None and `xmlfile` has a non-empty ``name`` attribute, an
            ImageStore for the directory of that name is used instead.

        Raises ParseError when parsing or loading raises ExpatError,
        XMLSyntaxError or XSLTApplyError.
        """
        # first, prepare for parsing
        if isinstance(xmlfile, basestring):
            with open(xmlfile, 'rb') as source:
                data = source.read()
        else:
            data = xmlfile.read()

        if not isinstance(data, unicode):
            data = data.decode('utf-8')

        # drop every U+FEFF (byte order mark) character before parsing
        data = data.replace(u'\ufeff', '')

        # assume images are in the same directory
        # NOTE(review): a plain path string has no `name` attribute, so no
        # ImageStore is derived for it here -- confirm this is intended.
        if image_store is None and getattr(xmlfile, 'name', None):
            image_store = ImageStore(path.dirname(xmlfile.name))

        try:
            parser = etree.XMLParser(remove_blank_text=False)
            tree = etree.parse(StringIO(data.encode('utf-8')), parser)

            me = cls(tree,
                     parse_dublincore=parse_dublincore,
                     image_store=image_store)
            me.load_frame_info()
            return me
        # `as` form is accepted by Python 2.6+ and Python 3 alike
        except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
            raise ParseError(e)
Exemplo n.º 3
0
def transform(wldoc, stylesheet='legacy', options=None, flags=None):
    """Transforms the WL document to XHTML.

    wldoc: the document to transform; the work is done on a deep copy.
    stylesheet: name passed to get_stylesheet() to locate the XSLT file.
    options: optional mapping of keyword arguments for the document's
        transform() call; 'gallery' defaults to "''" when not given.
        The mapping passed in is not modified.
    flags: optional iterable of attribute names set to 'yes' on the
        root element before the transformation.

    Returns an OutputFile built from the serialized HTML, or None if
    html_has_content() reports that the result has no content.

    Raises ValueError when a KeyError is raised (reported as an invalid
    stylesheet name) and ParseError on XMLSyntaxError or XSLTApplyError.
    """
    # Parse XSLT
    try:
        style_filename = get_stylesheet(stylesheet)
        style = etree.parse(style_filename)

        document = copy.deepcopy(wldoc)
        del wldoc  # from here on only the copy is used
        document.swap_endlines()

        if flags:
            for flag in flags:
                document.edoc.getroot().set(flag, 'yes')

        document.clean_ed_note()
        document.clean_ed_note('abstrakt')

        # Work on a private copy so the caller's mapping is left untouched
        # by the default inserted below.
        options = dict(options) if options else {}
        options.setdefault('gallery', "''")
        result = document.transform(style, **options)
        del document  # no longer needed large object :)

        if not html_has_content(result):
            return None

        result_root = result.getroot()
        add_anchors(result_root)
        add_table_of_themes(result_root)
        add_table_of_contents(result_root)

        return OutputFile.from_bytes(
            etree.tostring(result,
                           method='html',
                           xml_declaration=False,
                           pretty_print=True,
                           encoding='utf-8'))
    except KeyError:
        # NOTE(review): this handler covers the whole body, so a KeyError
        # raised by any step above is reported as a bad stylesheet name.
        raise ValueError("'%s' is not a valid stylesheet." % stylesheet)
    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)
Exemplo n.º 4
0
    def from_file(cls, xmlfile, *args, **kwargs):
        """Parse an XML source and return ``cls(tree, *args, **kwargs)``.

        xmlfile: a path string, or an object with a ``read()`` method.

        Raises ParseError when parsing or construction raises ExpatError,
        XMLSyntaxError or XSLTApplyError.
        """
        # first, prepare for parsing
        if isinstance(xmlfile, basestring):
            with open(xmlfile, 'rb') as source:
                data = source.read()
        else:
            data = xmlfile.read()

        if not isinstance(data, unicode):
            data = data.decode('utf-8')

        # drop every U+FEFF (byte order mark) character before parsing
        data = data.replace(u'\ufeff', '')

        try:
            parser = etree.XMLParser(remove_blank_text=False)
            tree = etree.parse(StringIO(data.encode('utf-8')), parser)

            return cls(tree, *args, **kwargs)
        # `as` form is accepted by Python 2.6+ and Python 3 alike
        except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
            raise ParseError(e)
Exemplo n.º 5
0
class WorkInfo(object):
    """Metadata of a work, read from the rdf:RDF section of an XML file.

    The known metadata entries are declared in FIELDS; instances are
    created through from_string() / from_file(), which delegate to
    ``cls.from_element``.
    """
    # NOTE(review): DCInfo presumably consumes FIELDS when the class is
    # created -- confirm against the metaclass definition.
    __metaclass__ = DCInfo

    # One Field per metadata entry: qualified tag name first, then the
    # attribute name, followed by optional converter and keyword options.
    FIELDS = (
        Field(DCNS('creator'),
              'authors',
              as_person,
              salias='author',
              multiple=True),
        Field(DCNS('title'), 'title'),
        Field(DCNS('type'), 'type', required=False, multiple=True),
        Field(DCNS('contributor.editor'),
              'editors',
              as_person,
              salias='editor',
              multiple=True,
              default=[]),
        Field(DCNS('contributor.technical_editor'),
              'technical_editors',
              as_person,
              salias='technical_editor',
              multiple=True,
              default=[]),
        Field(DCNS('contributor.funding'),
              'funders',
              salias='funder',
              multiple=True,
              default=[]),
        Field(DCNS('contributor.thanks'), 'thanks', required=False),
        Field(DCNS('date'), 'created_at'),
        Field(DCNS('date.pd'),
              'released_to_public_domain_at',
              as_date,
              required=False),
        Field(DCNS('publisher'), 'publisher', multiple=True),
        Field(DCNS('language'), 'language'),
        Field(DCNS('description'), 'description', required=False),
        Field(DCNS('source'), 'source_name', required=False),
        Field(DCNS('source.URL'), 'source_url', required=False),
        Field(DCNS('identifier.url'), 'url', WLURI, strict=as_wluri_strict),
        Field(DCNS('rights.license'), 'license', required=False),
        Field(DCNS('rights'), 'license_description'),
        Field(PLMETNS('digitisationSponsor'),
              'sponsors',
              multiple=True,
              default=[]),
        Field(WLNS('digitisationSponsorNote'), 'sponsor_note', required=False),
        Field(WLNS('developmentStage'), 'stage', required=False),
    )

    @classmethod
    def from_string(cls, xml, *args, **kwargs):
        """Build an instance from XML held in a string.

        The string is wrapped in a file-like object and passed, with the
        remaining arguments, to from_file().
        """
        from StringIO import StringIO
        return cls.from_file(StringIO(xml), *args, **kwargs)

    @classmethod
    def from_file(cls, xmlfile, *args, **kwargs):
        """Build an instance from the rdf:RDF section of an XML source.

        The source is streamed with ``etree.iterparse`` only until the
        rdf:RDF element has been closed; that element is then handed to
        ``cls.from_element`` together with the extra ``*args``/``**kwargs``.

        xmlfile: whatever ``etree.iterparse`` accepts as its source.

        Raises NoDublinCore when no rdf:RDF element is encountered, and
        ParseError when the parser raises XMLSyntaxError or ExpatError.
        """
        rdf_tag = RDFNS('RDF')
        rdf_element = None
        try:
            events = etree.iterparse(xmlfile, ['start', 'end'])

            # look for the opening of the RDF section
            for (event, element) in events:
                if event == 'start' and element.tag == rdf_tag:
                    rdf_element = element
                    break

            if rdf_element is None:
                # Adjacent literals instead of a backslash continuation, so
                # the source indentation does not leak into the message.
                raise NoDublinCore(
                    "DublinCore section not found. "
                    "Check if there are rdf:RDF and rdf:Description tags.")

            # Keep consuming the same event stream 'till the end of the RDF
            # section, so the element is fully populated before it is used.
            for (event, element) in events:
                if event == 'end' and element.tag == rdf_tag:
                    break

            # if there is no end, the parser is expected to complain with
            # one of the errors handled below

            # extract data from the element and make the info
            return cls.from_element(rdf_element, *args, **kwargs)
        # `as` form is accepted by Python 2.6+ and Python 3 alike
        except (XMLSyntaxError, ExpatError) as e:
            raise ParseError(e)
Exemplo n.º 6
0
def transform(wldoc,
              verbose=False,
              save_tex=None,
              morefloats=None,
              cover=None,
              flags=None,
              customizations=None,
              ilustr_path='',
              latex_dir=False):
    """ produces a PDF file with XeLaTeX

    wldoc: a WLDocument
    verbose: prints all output from LaTeX
    save_tex: path to save the intermediary LaTeX file to
    morefloats (old/new/none): force specific morefloats
    cover: a cover.Cover factory or True for default
    flags: names set on the root element as ``flag-<name>="yes"``
        (e.g. less-advertising)
    customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
    ilustr_path: directory the ``src`` of every ``ilustr`` element is
        resolved against when illustrations are copied to the build dir
    latex_dir: if true, stop before running XeLaTeX and return the path
        of the prepared build directory instead of a PDF

    Returns the build directory path when `latex_dir` is true, otherwise
    an OutputFile wrapping the generated PDF.

    Raises ParseError on XMLSyntaxError / XSLTApplyError, or when a
    xelatex run exits with a non-zero status.
    """

    # Parse XSLT
    try:
        book_info = wldoc.book_info
        document = load_including_children(wldoc)
        root = document.edoc.getroot()

        if cover:
            if cover is True:
                cover = make_cover
            bound_cover = cover(book_info, width=1200)
            root.set('data-cover-width', str(bound_cover.width))
            root.set('data-cover-height', str(bound_cover.height))
            if bound_cover.uses_dc_cover:
                if book_info.cover_by:
                    root.set('data-cover-by', book_info.cover_by)
                if book_info.cover_source:
                    root.set('data-cover-source', book_info.cover_source)
        if flags:
            for flag in flags:
                root.set('flag-' + flag, 'yes')

        # check for LaTeX packages
        if morefloats:
            root.set('morefloats', morefloats.lower())
        elif package_available('morefloats', 'maxfloats=19'):
            root.set('morefloats', 'new')

        # add customizations
        if customizations is not None:
            root.set('customizations', u','.join(customizations))

        # add editors info
        editors = document.editors()
        if editors:
            root.set(
                'editors',
                u', '.join(sorted(editor.readable() for editor in editors)))
        if document.book_info.funders:
            root.set('funders', u', '.join(document.book_info.funders))
        if document.book_info.thanks:
            root.set('thanks', document.book_info.thanks)

        # hack the tree
        move_motifs_inside(document.edoc)
        hack_motifs(document.edoc)
        parse_creator(document.edoc)
        substitute_hyphens(document.edoc)
        fix_hanging(document.edoc)
        fix_tables(document.edoc)
        mark_subauthors(document.edoc)

        # wl -> TeXML
        style_filename = get_stylesheet("wl2tex")
        style = etree.parse(style_filename)
        functions.reg_mathml_latex()

        # TeXML -> LaTeX
        temp = mkdtemp('-wl2pdf')

        # './/' is the explicit spelling of the descendant search that a
        # leading '//' gets rewritten to (with a FutureWarning) on a tree.
        for ilustr in document.edoc.findall(".//ilustr"):
            shutil.copy(os.path.join(ilustr_path, ilustr.get("src")), temp)

        for sponsor in book_info.sponsors:
            ins = etree.Element("data-sponsor", name=sponsor)
            logo = sponsor_logo(sponsor)
            if logo:
                fname = 'sponsor-%s' % os.path.basename(logo)
                shutil.copy(logo, os.path.join(temp, fname))
                ins.set('src', fname)
            root.insert(0, ins)

        if book_info.sponsor_note:
            root.set("sponsor-note", book_info.sponsor_note)

        texml = document.transform(style)

        if cover:
            # image data is binary, so the file must not be in text mode
            with open(os.path.join(temp, 'cover.png'), 'wb') as f:
                bound_cover.save(f, quality=80)

        del document  # no longer needed large object :)

        tex_path = os.path.join(temp, 'doc.tex')
        with open(tex_path, 'wb') as fout:
            process(six.BytesIO(texml), fout, 'utf-8')
        del texml

        if save_tex:
            shutil.copy(tex_path, save_tex)

        # LaTeX -> PDF
        shutil.copy(get_resource('pdf/wl.cls'), temp)
        shutil.copy(get_resource('res/wl-logo.png'), temp)

        if latex_dir:
            return temp

        try:
            cwd = os.getcwd()
        except OSError:
            cwd = None
        os.chdir(temp)

        try:
            # some things work better when compiled twice
            # (table of contents, [line numbers - disabled])
            for _ in range(2):
                if verbose:
                    status = call(['xelatex', tex_path])
                else:
                    # Discard the output instead of piping it: nobody reads
                    # the pipes, so a chatty run could block forever.
                    with open(os.devnull, 'wb') as devnull:
                        status = call(
                            ['xelatex', '-interaction=batchmode', tex_path],
                            stdout=devnull,
                            stderr=devnull)
                if status:
                    raise ParseError("Error parsing .tex file")
        finally:
            # restore the working directory even when LaTeX failed
            if cwd is not None:
                os.chdir(cwd)

        output_file = NamedTemporaryFile(prefix='librarian',
                                         suffix='.pdf',
                                         delete=False)
        # only the reserved name is needed; release the handle before the
        # PDF is moved onto that path
        output_file.close()
        pdf_path = os.path.join(temp, 'doc.pdf')
        shutil.move(pdf_path, output_file.name)
        shutil.rmtree(temp)
        return OutputFile.from_filename(output_file.name)

    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)