Exemplo n.º 1
0
def _coalesce_blocks(attrs, blocks):
    """Merge a run of '.block' elements into <pre> and <blockquote> output.

    `attrs` are the attributes shared by the block group; a ``class``
    containing ``'right'`` marks the whole group as a citation.  `blocks`
    is an iterable of ``(tag, attrs, body)`` triples whose bodies are
    consumed in order.

    Yields a <pre> element for each run of bodies that are a single
    <code> element, and a <blockquote> for each run of anything else.
    """
    B = Var('B')
    blocks = list(blocks)
    # loop-invariant: whether this group renders as a citation; hoisted
    # out of the consumption loops below
    is_citation = 'right' in attrs.get('class', [])

    def next_body():
        # pop the body of the next block, or [] once exhausted
        return blocks.pop(0)[2] if blocks else []

    while True:
        body = next_body()
        if not body:
            break
        # consecutive single-<code> bodies collapse into one <pre>
        pre_block = []
        while body and body == [('code', {}, B)]:
            pre_block.append(plaintextify(B.val) + '\n')
            body = next_body()
        if pre_block:
            yield mkel('pre', {}, pre_block)
        # everything up to the next <code> run becomes one <blockquote>
        non_pre_block = []
        while body and body != [('code', {}, B)]:
            if is_citation:
                non_pre_block.append(
                    mkel('footer', {}, [mkel('cite', {}, body)]))
            else:
                if needs_wrapping_in_p(body):
                    body = [mkel('p', {}, body)]
                non_pre_block.extend(body)
            body = next_body()

        if non_pre_block:
            yield mkel('blockquote', {}, tidy(non_pre_block))
Exemplo n.º 2
0
def needs_wrapping_in_p(body):
    """Return whether `body` should be wrapped in a <p> element.

    A body consisting of exactly one "real" block element (any block tag
    other than the '.footnote' pseudo-tag) needs no wrapping; everything
    else does.  If we're too eager to wrap things in p's then hopefully
    a subsequent tidy pass will remove them.
    """
    REAL_BLOCK_TAG = Var('REAL_BLOCK_TAG',
                         lambda tag: tag in BLOCK_TAGS and tag != '.footnote')
    _ = Var('_')
    return not body == [(REAL_BLOCK_TAG, _, _)]
Exemplo n.º 3
0
def hacky_flatten_block(block):
    """Fold a '.block' that wraps a single <p> into one '.block' element.

    The paragraph's attributes are merged into the block's and its body
    lifted up; anything not matching that shape is returned untouched.
    """
    # XXX(ash): move to postprocess
    # pylint: disable=C0103
    BLOCK_ATTRS, P_ATTRS, BODY = map(Var, ('BLOCK_ATTRS', 'P_ATTRS', 'BODY'))
    if block == ('.block', BLOCK_ATTRS, [('p', P_ATTRS, BODY)]):
        return mkel('.block',
                    merge_attrs(BLOCK_ATTRS.val, P_ATTRS.val),
                    BODY.val)
    return block
Exemplo n.º 4
0
def _pop_dl_meta(body, head):
    """Pops ``<dl>`` encoded metadata from `body` and stuffs it into `head`."""
    DL_BODY = Var('DL_BODY')
    # only act when the document starts with <dl id="document-properties">
    if body == Seq[('dl', {'id': 'document-properties'}, DL_BODY), :]:
        del body[0]  # consume the <dl>; mutating `body` in place is intended
        dl_body = space_normalize(DL_BODY.val)
        DD_BODY, ATTRS = map(Var, 'DD_BODY, ATTRS'.split(', '))
        DT = ('dt', Var('_'), Var('_'))
        DD = ('dd', ATTRS, DD_BODY)
        # pair up alternating <dt>/<dd> children; matching against
        # (DT, DD) in the assert binds ATTRS and DD_BODY as a side effect
        for dt_dd in zip(dl_body[::2], dl_body[1::2]):
            assert (DT, DD) == dt_dd
            # each <dd> carries exactly one class, naming the metadata key
            c, = ATTRS.val['class']
            # prefer an explicit data-value, else plaintextify the body
            head[c] = ATTRS.val.get('data-value', plaintextify(DD_BODY.val))
Exemplo n.º 5
0
def _tidy_heading(tag, attrs, body):
    """Yield a cleaned-up heading, or just its salvageable contents.

    If the heading has real textual content (anything left after removing
    images and figures), yield it with disallowed children whacked and
    the ``style`` attribute dropped.  Otherwise it is not really a
    heading: yield only the children that are neither named anchors nor
    plain strings.
    """
    # is there any actual textual content in the heading?
    cleansed = tidy(whack(lambda e: e not in CAN_OCCUR_IN_H, body))
    if not blank(whack(lambda e: e in ('img', 'figure'), cleansed)):
        yield tag, {k: v
                    for k, v in attrs.iteritems() if k != 'style'}, cleansed
        return
    # no, so it's not really a heading
    # but maybe it contains some misformatted figures or similar
    # so yield the contents that aren't anchors or whitespace strings
    _, _STRING = Var('_'), Var('_STRING', isinstance, basestring)
    for x in cleansed:
        if x != ('a', {'name': _}, []) and x != _STRING:
            yield x
def test_make_cover():
    """Pin the exact cover-page XHTML and the cover OPF manifest entries."""
    dummy_image = literal.Image('', 'image/jpeg', OrderedDict())
    # NB: whitespace-sensitive verbatim comparison of the emitted document
    assert make_cover_page(src='SOME_HASH.jpg', title='Dummy Title') == \
'''<?xml version='1.0' encoding='UTF-8'?>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
  <head>
    <title>Dummy Title</title>
    <link href="css/stylesheet.css" rel="stylesheet" type="text/css"/>
  </head>
  <body>
    <div class="cover-page">
      <img alt="Dummy Title" src="SOME_HASH.jpg" title="Dummy Title"/>
    </div>
  </body>
</html>
'''
    # the image href is only constrained to be *some* .jpg filename
    assert make_cover_opf(dummy_image, src='SOME_HASH.jpg') == [
        ('item',
         {'href': Var('_', re.compile(r'.*\.jpg').match),
          'id': 'cover-image',
          'media-type': 'image/jpeg',
          'properties': 'cover-image'},
         []),
        ('item',
         {'href': 'cover.xhtml',
          'id': 'cover',
          'media-type': 'application/xhtml+xml'},
         [])]
Exemplo n.º 7
0
def parse_cites(parsed_body, bib_entries, collect_cite):  # pylint: disable=R0914
    """Turn Zotero citation links in `parsed_body` into cite elements.

    Walks `parsed_body` recursively.  An ``<a>`` whose href matches
    ZOTERO_ITEM_URL_REX and whose plaintextified body matches REF_KEY_REX
    is replaced by a ``cite(...)`` element; malformed citations become
    error elements via ``mkerr``.  `collect_cite` is invoked with each
    citation key encountered; `bib_entries` supplies bibliography fields
    used to sanity-check the zotero id.  Returns the transformed body.
    """
    ZURL = Var('ZURL', ZOTERO_ITEM_URL_REX.match)
    ZBODY = Var('ZBODY', lambda s: REF_KEY_REX.match(plaintextify(s)))
    ans = []
    # set when spaces get moved out of a link, which can leave adjacent
    # plain strings in `ans` that must be re-joined at the end
    coalesce_strings = False

    for e in parsed_body:
        if isinstance(e, basestring):
            ans.append(e)
        elif e == ('a', {'href': ZURL}, ZBODY):
            link_text = ZBODY.match.group(0).strip()
            # sp1 and sp2 are potential leading and trailing spaces which we
            # tolerate and move out of the link
            sp1, paren, key, field, post, sp2 = ZBODY.match.groups()
            # a parenthesized citation must end with ']' and a textual one
            # must not; any mismatch means an unbalanced bracket
            if bool(paren) != link_text.endswith(']'):
                ans.append(
                    mkerr([e], 'malformed citation, unmatched %s' %
                          ('[]'[not paren])))
                continue
            # normalize the scheme to http for comparison with bib data
            zotero_id = 'http:' + ZURL.match.group(0).split(':', 1)[1]
            collect_cite(key)
            fields = bib_entries[key].fields if key in bib_entries else {}
            # XXX(alexander): cleanse_post_citation kinda takes rich-text, we
            # only do plaintext for now
            post_text, = cleanse_post_citation([post])
            if 'zoteroid' not in fields or fields.get('zoteroid') == zotero_id:
                if sp1:
                    ans.append(sp1)
                    coalesce_strings = True
                ans.append(cite(key, post_text, textual=not paren,
                                field=field))
                if sp2:
                    ans.append(sp2)
                    coalesce_strings = True
            else:
                ans.append(mkerr([e], 'bad citation key'))
        else:
            # not a citation link: recurse into the element's children
            ans.append(
                mkel(e[0], e[1], parse_cites(e[2], bib_entries, collect_cite)))
    if coalesce_strings:
        # merge runs of adjacent strings created by the space-lifting above
        i = 1
        while i < len(ans):
            if (isinstance(ans[i], basestring)
                    and isinstance(ans[i - 1], basestring)):
                ans[i - 1:i + 1] = [ans[i - 1] + ans[i]]
            i += 1
    return ans
Exemplo n.º 8
0
    def handle_emphasis(self, emph, body):
        r"""Boldens italicizes or strikes-through latex text.

        Harder than it sounds: The problem being that \textbf and \textit
        don't work across paragraphs and \bfseries and \itshape don't do
        italic correction (i.e. the end of the emphasized text juts into what
        follows it, because the space is not widened as necessary).

        >>> writer = LatexWriter()
        >>> print writer.handle_emphasis('b', ['some bold text'])
        \textbf{some bold text}
        >>> print writer.handle_emphasis(
        ...     'b', [('p', {}, [('i', {},  ['some bold italic'])]), 'text'])
        {\bfseries{}\textit{some bold italic}
        <BLANKLINE>
        text\/}
        >>>

        With strikethrough and underline the problem is even worse. TeX itself
        has no underline/strikethrough at all and the default LaTeX \underline
        command is broken (e.g. makes the text un(line)breakable). All
        replacements like soul's \ul and ulem's \uline have weird limitations
        that cause random breakage, so we push these styles down into the body
        recursively.

        >>> print writer.handle_emphasis(
        ...  'u', [('p', {}, [('i', {},
        ...                     [('b', {}, ['ul bold italic'])])]), 'text'])
        {\itshape{}{\bfseries{}\uline{ul bold italic}\/}\/}
        <BLANKLINE>
        \uline{text}

        """
        # can safely use \textit/\textbf etc.
        INLINE_TEXT = Var(
            'INLINE_TEXT',  # pylint: disable=C0103
            lambda x: isinstance(x, basestring) and '\n' not in x)
        if body == [INLINE_TEXT]:
            # simple single-line text: the plain \textbf/\textit-style
            # command is safe here
            return cmd(self.INLINE_EMPH_TO_LATEX[emph], [],
                       [self.latexify(body)])
        else:
            if emph in ('b', 'i'):
                # need to use itshape/bfseries and do italic correction (r'\/')
                return texcmd(
                    dict(b='bfseries', i='itshape')[emph],
                    join(self.latexify(body), r'\/'))
            else:
                assert emph in ('u', 's')
                # XXX: it might be better to have latexify as the outmost call
                # here rather than join indivudally converted parts. That would
                # allow for further rewrite logic in other parts of the latex
                # converter.
                # push u/s down onto each child of every element so the
                # fragile \uline-style commands never span paragraphs
                return join(*(self.handle_emphasis(emph, [e]) if isinstance(
                    e, basestring) else self.latexify(
                        mkel(*e[:2],
                             body=[
                                 mkel(emph, {}, [subbody_part])
                                 for subbody_part in e[2]
                             ])) for e in body))
Exemplo n.º 9
0
def extract_labels(body):
    """Split named-anchor elements out of `body`.

    Returns ``(labels, newbody)``: `labels` holds the ``name`` attribute
    (sans leading ``#``) of every empty ``<a name=...>`` anchor found,
    and `newbody` is `body` with those anchors removed.
    """
    HREF = Var('HREF')
    anchor = ('a', {'name': HREF}, [])
    labels, remainder = [], []
    for element in body:
        if element == anchor:
            labels.append(HREF.val.lstrip('#'))
        else:
            remainder.append(element)
    return labels, remainder
Exemplo n.º 10
0
def _space_normalize(es,
                     lstrip=False,
                     rstrip=False,
                     parent_was_block_el=False):
    """Whitespace-normalize the child elements `es`, dropping emptied ones.

    `lstrip`/`rstrip` request stripping at the left/right edge; the first
    child of a block element is additionally lstripped and the last
    rstripped, as is any child immediately followed by a real block
    element.  Returns ``(new_elements, lstrip)`` -- the surviving
    normalized elements plus the lstrip state left over from the last
    child (presumably threaded into following siblings by the caller;
    TODO confirm).
    """
    # a "real" block tag is any block tag except the .footnote pseudo-tag
    REAL_BLOCK_TAG = Var('REAL_BLOCK_TAG',
                         lambda e: e in BLOCK_TAGS and e != '.footnote')
    _ = Var('_')
    ans = []
    n = len(es)
    for i, e in enumerate(es):
        new_e, lstrip = _space_normalize1(
            e,
            # NB: the parenthesization difference is intentional
            lstrip=lstrip or parent_was_block_el and i == 0,
            rstrip=(rstrip or parent_was_block_el) and i == n - 1
            or (es[i + 1:i + 2] == [(REAL_BLOCK_TAG, _, _)]))
        # elements that normalized away to nothing are dropped entirely
        if new_e:
            ans.append(new_e)
    return ans, lstrip
Exemplo n.º 11
0
def unwrap_figures(body):
    """Lift block figures out of wrapping paragraphs (generator).

    Standalone figures (bare, or alone inside a <p>) are forced to
    display:block; a <p> containing a block figure is split into
    paragraph / figure / paragraph.  Recurses into tables, rows, cells
    and blockquotes; everything else passes through unchanged.
    """
    # XXX: this currently only operates at the toplevel, both looking for
    # paragraphs and also looking for block figures in paragraphs. Strictly
    # speaking we should probably descend for both. As an additional hack, we
    # descend, up to the the <td> level, into tables.
    FATTRS, PATTRS, FBODY = map(Var, 'FATTRS, PATTRS, FBODY'.split(', '))
    BLOCK_STYLE_ATTR = Var('BLOCK_STYLE_ATTR',
                           lambda a: a['style']['display'] == 'block')
    BLOCK_FIG = ('figure', BLOCK_STYLE_ATTR, FBODY)
    PBODY_WITH_BLOCKFIG = Var('PBODY_WITH_BLOCKFIG', list.__contains__,
                              BLOCK_FIG)
    for elem in body:
        if elem and elem[0] in ('table', 'tr', 'td', 'blockquote'):
            yield mkel(elem[0], elem[1], list(unwrap_figures(elem[-1])))
        elif elem in (('p', {}, [('figure', FATTRS, FBODY)]), ('figure',
                                                               FATTRS, FBODY)):
            # override style of standalone figures
            new_fattrs = copy.deepcopy(FATTRS.val)
            new_fattrs['style']['display'] = 'block'
            yield mkel('figure', new_fattrs, FBODY.val)
        # Split a <p> that contains a block figure into
        # two paragraphs separated by a figure.
        # This case can only arise due to the
        # large inline image heuristic; if the paragraph
        # has an id attribute (shouldn't happen yet),
        # we put it into the first half of the split. We throw away
        # empty <p>s.
        elif elem == ('p', PATTRS, Seq[PBODY_WITH_BLOCKFIG:]):
            body = PBODY_WITH_BLOCKFIG.val
            i_fig = body.index(BLOCK_FIG)
            if body[:i_fig]:
                yield mkel('p', PATTRS.val, body[:i_fig])
                # the id (and its friends) stays with the first half
                cloned_attrs = dict(
                    (k, v) for (k, v) in PATTRS.val.items() if k != 'id')
            else:
                cloned_attrs = PATTRS.val
            yield body[i_fig]
            if cloned_attrs or body[i_fig + 1:]:
                yield ('p', cloned_attrs, body[i_fig + 1:])
        else:
            yield elem
Exemplo n.º 12
0
def _as(format, node):
    """Syntax-highlight the code inside a ``<pre>`` node.

    `format` selects the output flavour ('html' or 'latex'); the code's
    language is guessed from its text.  Raises RuntimeError on any other
    format.
    """
    PRE = Var('PRE')
    assert node == ('pre', {}, [PRE])
    source = PRE.val
    lang = _guess_lang(source)
    formatter_by_format = {'html': HtmlFormatter, 'latex': LatexFormatter}
    if format not in formatter_by_format:
        raise RuntimeError('Not a valid output format: %r' % format)
    return highlight(source, get_lexer_by_name(lang),
                     formatter_by_format[format]())
Exemplo n.º 13
0
def coalesce(es):  # pylint: disable=R0912,R0914
    """Coalesce adjacent sibling elements of `es` (generator).

    Adjacent strings are joined and NFC-normalized; runs of elements with
    identical (tag, attrs) are merged by tag-specific helpers; bogus
    empty elements are dropped; headings are tidied.
    """
    def grouper(thing):
        # group key: the `basestring` type for text nodes, (tag, attrs)
        # for elements
        if isinstance(thing, basestring):
            return basestring
        else:
            return thing[:2]

    # shapes of elements that carry no information and can be dropped
    EMPTY_NON_VOID_ELEMENT = (Var('_', lambda tag: tag not in FULLY_VOID_TAGS),
                              {}, [])
    EMPTY_BLOCK_ELEMENT = (Var('_', NON_EMPTY_BLOCK_TAGS.__contains__),
                           Var('_'), [Var('_', blank)])
    # an empty <a> is bogus unless it is a named anchor
    EMPTY_LINK = ('a', Var('ATTRS', lambda a: 'name' not in a), [])
    BOGUS_ELEMENTS = (EMPTY_NON_VOID_ELEMENT, EMPTY_BLOCK_ELEMENT, EMPTY_LINK)
    for (tag_attrs, group) in groupby(es, grouper):
        if tag_attrs is basestring:
            yield nfc("".join(group))
        else:
            tag, attrs = tag_attrs
            if tag in INLINE_TAG or tag == 'blockquote':
                for x in _coalesce_siblings(tag, attrs, group):
                    if x not in BOGUS_ELEMENTS:
                        yield x
            elif tag == '.block':
                for x in _coalesce_blocks(attrs, group):
                    yield x
            # FIXME(alexander): don't simplify CMD and LIT contents for now...
            # ... this is needed because of the stupid representation of
            # citations, in particular
            elif tag in ('LIT', 'CMD'):
                for x in group:
                    yield x
            else:
                # generic elements: simplify each, drop bogus ones, and
                # give headings the extra tidying pass
                for x in (_coalesce_parent_child(parent) for parent in group):
                    if x in BOGUS_ELEMENTS:
                        continue
                    if tag in H_TAGS:
                        for y in _tidy_heading(*x):
                            yield y
                    else:
                        yield x
Exemplo n.º 14
0
def _coalesce_parent_child(parent):
    """Tidy `parent`'s body and collapse redundant parent/child nesting."""
    tag, attrs, raw_body = parent
    # tidy first so the shape-matching below sees a canonical body
    body = tidy(raw_body)
    B = Var('B')
    # rationale:
    #     <li>
    #       <p>a</p>
    #       <ul>...</ul>
    #     </li>
    # should be transformed to:
    #     <li>
    #       a
    #       <ul>...</ul>
    #     </li>
    DOES_NOT_START_WITH_P = Var(
        'DOES_NOT_START_WITH_P',
        lambda elts: not any(is_p(elt) for elt in elts))
    BODY_WITH_BOGUS_P = Seq[('p', {}, B), DOES_NOT_START_WITH_P:]
    # google docs inserts paragraphs at the darnest places
    # unwrap singleton paragraphs where they don't belong
    # XXX(alexander): consider lifting p attributes
    # like justify class in comprehensive-test
    if tag in ('li', 'dt', 'dd', '.footnote') and body == BODY_WITH_BOGUS_P:
        body = B.val + DOES_NOT_START_WITH_P.val
    elif (tag, attrs) == ('p', {}) and body in ([('.pagebreak', {}, [])
                                                 ], [('blockquote', {}, B)]):
        # a bare <p> whose sole child is a pagebreak or blockquote:
        # replace the <p> by that child
        (tag, attrs, body), = body
    else:
        # lift color/background-color styling from a lone inner <span>
        # onto the parent -- but only styles the parent doesn't set itself
        LIFTABLE_SPAN_STYLE = Var(
            'LIFTABLE_SPAN_STYLE',
            lambda d: not (set(d) - ({'color', 'background-color'} - set(
                attrs.get('style', {})))))
        if body == [('span', {'style': LIFTABLE_SPAN_STYLE}, B)]:
            body = B.val
            attrs = _style_merge(attrs, LIFTABLE_SPAN_STYLE.val)
    return mkel(tag, attrs, body)
Exemplo n.º 15
0
def lift_code(para):
    """Collapse a paragraph made up entirely of <code> elements into one.

    The bodies of the individual code elements are concatenated into a
    single <code>; attributes on them are dropped with a warning.  A
    paragraph that is not all-code is returned unchanged.
    """
    # pylint: disable=C0103
    ALL_CODE = Var('ALL_CODE',
                   lambda elements: all(e[:1] == ('code', )
                                        for e in elements))
    if para == ('p', {}, ALL_CODE):
        # XXX(ash): maybe should do this coalescing of adjacent `code`
        # bodies in postprocess?
        merged = []
        for element in ALL_CODE.val:
            _, code_attrs, code_body = element
            if code_attrs:
                log.warn('ignoring attrs on code tag %r', element)
            merged.extend(code_body)
        return mkel('code', {}, merged)
    return para
Exemplo n.º 16
0
def _pop_title_and_subtitle(body, head):
    """Pops (sub)titles from `body` and stuffs them into ``head``."""
    _, BODY, REST = map(Var, '_, BODY, REST'.split(', '))
    # accept either a dedicated <title>/<subtitle> tag or an <h1>/<h2>
    # that carries the corresponding class
    for tag, alt_h in [('title', 'h1'), ('subtitle', 'h2')]:
        if body in (Seq[(tag, {}, Seq[BODY:]), REST:], Seq[(alt_h, {
                'class': tag
        }, Seq[BODY:]), REST:]):
            # XXX(alexander): plaintextification of (sub)titles
            title_str = space_normalize(plaintextify(BODY.val))
            if title_str:
                head[tag] = title_str
            del body[0]  # consume the title element; `body` is mutated
        # skip empty paragraphs between title and subtitle and subtitle and meta
        while body and body[0] in [
                Var('_', lambda x: isinstance(x, basestring) and blank(x)),
            ('p', _, Seq(blank_flat_body)[:])
        ]:
            log.warn('Killing blank gunk before metadata')
            del body[0]
Exemplo n.º 17
0
    def handle_run(self, r):
        """Convert a docx run element `r` into a list of inline content.

        Text, tabs, breaks and special hyphens become strings/elements;
        drawings are transcluded; footnote/endnote references become
        footnote elements.  Run properties (rPr), if present, are applied
        to the result -- except when the result starts with a footnote
        (see the final Seq pattern).
        """
        # XXX(ash): pylint is right about this being too complex
        # pylint: disable=R0912
        _ = Var('_')
        ans = []
        rPr = first_of_tag(r, RUN_PROPS_TAG)
        # skip past the leading run-properties element when iterating
        content = rPr.itersiblings() if rPr is not None else iter(r)
        for e in content:
            # pylint: disable=W0622
            type = e.attrib.get(ns.w('type'))
            if e.tag == TEXT_TAG:
                ans.append(e.text)
            elif e.tag == TAB_TAG:
                # XXX(alexander): this can also work like a '_' or '…' \dotfill
                ans.append('\t')
            elif e.tag in (FOOTNOTE_REF_TAG, ENDNOTE_REF_TAG):
                # XXX(ash): what is going on here
                pass
            elif e.tag == BREAK_TAG and type in ('page', 'column'):
                ans.append(mkel('.pagebreak', {}, []))
            elif e.tag == BREAK_TAG or e.tag == CR_TAG:
                # only plain text-wrapping breaks are expected here
                assert (type is None) or (type == 'textWrapping')
                ans.append(mkel('br', {}, []))
            # FIXME, tags below untested
            elif e.tag == SOFT_HYPHEN_TAG:
                ans.append(SOFT_HYPHEN)
            elif e.tag == NON_BREAKING_HYPHEN_TAG:
                ans.append(NON_BREAKING_HYPHEN)
            elif e.tag == ns.w('drawing'):
                ans.extend(
                    flatmap(self.transclude,
                            e.xpath(self.IMAGE_XPATH, namespaces=ns.dict)))
            elif e.tag in (FOOTNOTE_REFERENCE_TAG, ENDNOTE_REFERENCE_TAG):
                ans.append(self.make_footnote(e))
            else:
                # movie,
                # rt, ruby, rubyAlign etc. for ruby stuff
                # sym, with special handling for wingdings I guess...
                log.warn('Unknown tag %r', e.tag)
        if rPr is not None and ans != Seq[Seq['.footnote', _:], _:]:
            ans = self.apply_rpr(rPr, ans)

        return ans
Exemplo n.º 18
0
 def build_list(cls, tree):
     """Build nested list markup from an outline `tree`.

     `tree` appears to be a list whose items are either
     ``((tag, attr), text)`` pairs or nested lists (sub-outlines) --
     TODO confirm against the caller.
     """
     _ = Var('_')
     if isinstance(tree, list):
         ans = []
         # nested lists get the wildcard key (_, _); since Var('_')
         # presumably compares equal to anything, a sub-list groups
         # together with the surrounding run of items (hence the
         # isinstance check inside the loop below)
         for (tag, attr), body in itertools.groupby(
                 tree, lambda x: (_, _) if isinstance(x, list) else x[0]):
             this_body = []
             if tag is _:
                 # group *started* with a nested list: wrap it in a .block
                 body, = body
                 ans.append(mkel('.block', {}, cls.build_list(body)))
             else:
                 for x in body:
                     if isinstance(x, list):
                         # sub-list: splice its items into the preceding <li>
                         # NOTE(review): assumes a <li> was already emitted;
                         # a group leading with a sub-list would IndexError
                         # here -- confirm inputs
                         item = cls.build_list(x)
                         this_body[-1][2].extend(item)
                     else:
                         item = [x[1]]
                         this_body.append(mkel('li', {}, item))
                 ans.append(mkel(tag, attr, this_body))
     # NOTE(review): if `tree` is not a list, `ans` is unbound here and
     # this raises NameError -- confirm callers always pass lists
     return ans
Exemplo n.º 19
0
    def extract_header(elems):
        """Decide whether the cells `elems` constitute a table header row.

        Returns ``(is_header, attrs)``: when every cell is either blank
        or consists solely of bold text, `attrs` collects the per-cell
        attributes to propagate; otherwise ``(False, [])``.
        """
        attrs = []
        for elem in elems:
            _TAG, TATTRS, PATTRS, _SATTRS, _BATTRS, TBODY = map(
                Var,
                "_TAG, TATTRS, PATTRS, _SATTRS, _BATTRS, TBODY".split(', '))
            _BLANK_BODY = Var('_BLANK', blank)
            if elem == (_TAG, TATTRS, _BLANK_BODY):
                # empty cell - accept, but do not propagate attrs, apart
                # from background-color
                bg = TATTRS.val.get('style', {}).get('background-color')
                attrs.append({})
                if bg:
                    iadd_style(attrs[-1], 'background-color', bg)
            elif elem in ((_TAG, TATTRS, [
                ('p', PATTRS, [('span', _SATTRS, [('b', _BATTRS, TBODY)])])
            ]), (_TAG, TATTRS, [('p', PATTRS, [('b', _BATTRS, TBODY)])])):
                # a cell whose single paragraph is entirely bold
                # (optionally wrapped in a span): header material
                attrs.append(merge_attrs(TATTRS.val, PATTRS.val))
            else:
                # not a header cell, so this is not a header row
                return False, []

        return True, attrs
Exemplo n.º 20
0
def handle_fragment(fragment, indent, transclusions, h_shift, epub_clean,
                    bibliography):
    """Render a single parsed fragment to an HTML/XHTML string.

    `fragment` is either a plain string (escaped and returned) or a
    ``(tag, attrs, content)`` triple.  `indent` is the current leading
    whitespace, `transclusions` resolves referenced resources, `h_shift`
    shifts heading levels, `epub_clean` requests epub-valid output and
    `bibliography` resolves citation commands.
    """
    # pylint: disable=R0911,R0914,R0912,R0913,R0915
    # FIXME(alexander): clean this up a bit, and get rid of pylint muffles
    if isinstance(fragment, basestring):
        return cgi.escape(fragment)

    (tag, attrs, content) = fragment
    # script/style bodies must not be entity-escaped; CDATA-wrap if needed
    if tag in ['script', 'style'] and content:
        content_str, = content
        return NOT_INLINE_TEMPLATE % dict(
            indent=indent,
            tag=tag,
            attrs_str=encode_attrs(attrs, transclusions, epub_clean),
            content_str=_indent(
                '\n' + maybe_cdatafy(_indent(content_str.strip('\n'), ' ')),
                indent))
    if tag == 'pre':
        # code blocks are rendered wholesale by the syntax highlighter
        return '\n' + highlight.as_html(fragment)

    # special case figures and tables
    if tag == 'figure':
        # move the width from the figure's style onto the contained img
        style = attrs['style'].copy()
        width = style.pop('width', '100%')
        attrs = dict(attrs.items(), style=style)
        # FIXME(alexander): dirty hacks to fixup caption & width
        img = content[-1]
        assert img[0] == 'img'
        img[1].setdefault('style', OrderedDict())['width'] = width
        # put figcaption towards end
        if content[0][0] == 'figcaption':
            content[0], content[-1] = content[-1], content[0]
        if style['display'] == 'inline':
            ATTRS = Var('ATTRS')  # pylint: disable=C0103
            assert content[:1] == [('img', ATTRS, [])], \
                "figure does not begin with an img"
            attrs = add_class(ATTRS.val, 'margin')
            # peel of the figure tag for inlined stuff
            # as a hack to make epub/html validate
            # (figures can't occur in all contexts imgs can)
            return handle_fragments([('img', attrs, [])],
                                    bibliography=bibliography,
                                    indent=indent,
                                    transclusions=transclusions,
                                    h_shift=h_shift,
                                    epub_clean=epub_clean)
    elif tag == 'table':
        colgroups = [el for el in content if el[0] == 'colgroup']
        COLS = Var("COLS")  # pylint: disable=C0103
        assert colgroups == [('colgroup', {}, COLS)], \
                "Expected single colgroup in table %s" % content
        # FIXME(alexander): this deepcopy is a lazy hack so we can mutate away
        # imperatively propagate table cell alignment down
        # this is a pretty horrible hack and would blow
        # up nastily if there is attribute aliasing,
        # but deepcopying should kinda make it work
        content = copy.deepcopy(content)
        _propagate_alignment(content, COLS.val)

    elif tag == 'col':
        # legacy html wants the width as an attribute, not a style
        if not epub_clean:
            attrs = attrs.copy()
            attrs['width'] = attrs['style']['width']
            del attrs['style']
        # cull
        ## return handle_fragments(content, indent)
    # FIXME(alexander): might make more sense to filter (or h-ify) these out
    # elsewhere, but for now this seems not unreasonable
    elif tag == 'title':
        tag = 'h1'
        attrs = add_class(attrs, 'title')
    elif tag == 'subtitle':
        tag = 'h2'
        attrs = add_class(attrs, 'subtitle')
    elif tag in ('CMD', 'LIT'):
        bad_command = None
        cmd_type, = attrs['class']
        # FIXME(alexander): convert tex to html for non-math;
        # convert tex math to MML for epub
        if cmd_type in ('$', 'tex'):
            tex, = content
            if cmd_type == '$':
                tex = r'\(%s\)' % tex
            return '<span class="tex2jax_process">%s</span>' % cgi.escape(tex)
        elif CITE_REX.match(cmd_type):
            if bibliography:
                bibliography.cited.add(content[0])
                # post = ('[%s]' % content[1] if len(content) > 1 and content[1]
                #         else '')
                # Post is ignored for the moment
                return _format_citation(cmd_type, content[0], bibliography)
            else:
                docerror.docproblem(
                    'Citation exists, but bibliography is missing')
        else:
            bad_command = cmd_type + (':' if content else '')
            docerror.docproblem('Unknown command type:%s' % cmd_type)
    elif epub_clean:
        # epub disallows <a name> anchors and img size attributes
        if tag == 'a' and 'name' in attrs:
            assert len(attrs) == 1
            attrs = {'id': attrs['name']}
        elif tag == 'img':
            attrs = {
                k: attrs[k]
                for k in attrs if k not in ('width', 'height')
            }

    # FIXME(alexander): support continued-list properly in html, by keeping
    # track of numbers of items per list-id and translating it to start

    if tag in H_TAGS:
        if h_shift:
            # shift the heading level, clamped to the valid h1..hN range
            tag = 'h%d' % min(len(H_TAGS), max(1, int(tag[1]) + h_shift))

    # generic [tagname].class tags
    if '.' in tag:
        if tag == '.pagebreak':
            tag = 'div.pagebreak'  # for whitespace sanitization
        tagname, classname = tag.split('.', 1)
        tag = tagname or 'span'
        attrs = add_class(attrs, classname)

    if tag == 'CMD' and bad_command:
        tag = 'span'
        attrs = {'class': ['bad-command']}
        content = [('u', {}, [bad_command])] + content
    elif tag == 'ERR':
        tag = 'span'
        attrs = {'class': ['err'], 'title': attrs['info'][0]}

    content_str = handle_fragments(content,
                                   indent='  ' + indent,
                                   transclusions=transclusions,
                                   h_shift=h_shift,
                                   epub_clean=epub_clean,
                                   bibliography=bibliography)
    # pick a template: void tags self-close, inline tags hug their
    # content, multi-line content gets the indented template
    if tag in VOID_TAGS:
        assert not content
        template = "<%(tag)s%(attrs_str)s/>"
    elif tag in INLINE:
        template = "<%(tag)s%(attrs_str)s>%(content_str)s</%(tag)s>"
    elif '\n' in content_str:
        template = NOT_INLINE_TEMPLATE
    else:
        template = COMPACT_NOT_INLINE_TEMPLATE

    # FIXME(alexander): disgusting hack; fix this properly and
    # use a set representation to start with!
    classes = attrs.get('class')
    if classes:
        attrs = attrs.copy()
        attrs['class'] = sorted(set(classes))

    return template % dict(indent=indent,
                           tag=tag,
                           attrs_str=encode_attrs(attrs, transclusions,
                                                  epub_clean),
                           content_str=content_str)
Exemplo n.º 21
0
    def latexify(self, ast):  # pylint: disable=E0102,R0914,R0915,R0911,R0912
        if isinstance(ast, list):
            return re.sub('\n\n$', '\n', join(*map(self.latexify, ast)))
        else:
            node = ast
            if isinstance(node, basestring):
                return quote(node)
            else:
                assert isinstance(node, tuple)
                h, a, b = node
                if h == 'div':  # canonicalize pseudo-elements
                    h = a['class'].pop()
                    assert not a['class']
                    del a['class']

                if h[:-1] == 'h':
                    if self.am_inside('list') or self.am_inside('table'):
                        return docwarn(
                            self.latexify(b),
                            'Cannot have sections inside lists or tables: %r' %
                            postprocess.plaintextify(b))
                    else:
                        with self.inside('section'):
                            if a:
                                log.warn('heading w/ attr %r', a)
                            labels, b = extract_labels(b)
                            return self.section(h, b, labels)
                elif h == 'p':
                    ans = nl(self.latexify(b))
                    if self.am_inside('.footnote') and self.am_inside('table'):
                        return docwarn(
                            ans, 'Multi-paragraph footnotes in tables are'
                            ' unsupported')
                    return nl(ans)
                elif h == 'span':
                    return self.latexify(b)  # XXX
                elif h in ('ol', 'ul'):
                    ol = partial(self.enumerate_,
                                 start=a.get('start'),
                                 series=a.get('id'),
                                 resume=a.get('data-continue-list'))
                    with self.inside('list'):
                        return nl(
                            freshline({
                                'ol': ol,
                                'ul': itemize
                            }[h](self.latexify(b))))
                elif h == 'li':
                    labels, b = extract_labels(b)
                    labelling = (join(*(map(mklabel, labels) +
                                        [' '])) if labels else '')
                    return join(freshline(cmd('item')), labelling,
                                self.latexify(b))
                elif h == 'table':
                    nested_table = self.am_inside('table')
                    with self.inside('table'):
                        # pylint: disable=C0103
                        CLASS_TO_SPEC = {
                            'left': 'P',
                            'center': 'C',
                            'right': 'R',
                            'justify': 'N'
                        }
                        b = b[:]
                        tablecaption = None
                        if b[0][0] == 'caption':
                            with self.inside('caption'):
                                tablecaption = self.latexify(b[0][2])
                            del b[0]

                        colgroup = [el for el in b if el[0] == 'colgroup']
                        rows = [el for el in b if el[0] == 'tr']
                        assert len(colgroup) == 1, \
                                "Expected single colgroup in table %s" % b
                        cols = colgroup[0][2]
                        colspecs = []
                        for col_h, col_a, col_b in cols:
                            if col_h != 'col':
                                break
                            assert not col_b

                            coltype = 'P'
                            for cls in CLASS_TO_SPEC:
                                if cls in col_a.get('class', []):
                                    coltype = CLASS_TO_SPEC[cls]

                            coltype = "%s{%s}" % (coltype,
                                                  textwidth_percent(
                                                      col_a['style']['width']))

                            colspecs.append(coltype)
                        rows = "\\tabularnewline\n".join(
                            map(self.latexify, rows))
                        if nested_table and tablecaption:
                            docproblem(
                                "Tables within tables can't have captions;"
                                " outputing caption as normal text",
                                level='warning')

                            ans = join(nl(table(colspecs, rows)), tablecaption)
                        else:
                            ans = table(colspecs, rows, tablecaption)
                    if self.post_float_yuck and not self.am_inside('table'):
                        ans = join(ans, *self.post_float_yuck)
                        del self.post_float_yuck[:]
                    return ans
                elif h == 'col':  # FIXME
                    assert False, "Unexpected col"
                elif h == 'tr':
                    return " & ".join(map(self.latexify, b))
                elif h == 'td':
                    if 'headcol' in a.get('class', []):
                        return colh(self.latexify(b))
                    return self.latexify(b)
                elif h == 'th':
                    if 'headcol' in a.get('class', []):
                        return rowh(colh(self.latexify(b)))
                    return rowh(self.latexify(b))
                elif h == 'figure':
                    b = b[:]
                    if b[0][0] == 'figcaption':
                        with self.inside('caption'):
                            figcaption = self.latexify(b[0][2])
                        del b[0]
                    else:
                        figcaption = None
                    assert len(b) == 1 and b[0][0] == 'img'
                    img = b[0][1]['src']
                    inline = False
                    warns = []
                    if a['style']['display'] == 'inline':
                        if self.am_inside('table'):
                            warns.append([
                                'Margin figures not supported in tables, '
                                'inserting into table cell'
                            ])
                        else:
                            inline = True
                    if inline:
                        if figcaption:
                            warns.append([
                                'Ignoring figcaption for inline figure:'
                                ' "%s"', figcaption
                            ])
                        ans = marginfigure(img=img)
                    else:
                        fakecaption = figcaption and self.am_inside('table')
                        if fakecaption:
                            warns.append([
                                "Figures in tables can't have captions; "
                                "outputing caption as normal text"
                            ])
                        # inside blockquotes more complicated figure
                        # environments don't seem to work reliably
                        rawincludegraphics = self.am_inside('blockquote')
                        ans = figure(img=img,
                                     classes=a.get('class', []),
                                     width=a['style']['width'],
                                     figcaption=figcaption,
                                     fakecaption=fakecaption,
                                     rawincludegraphics=rawincludegraphics)
                    if self.post_float_yuck and not self.am_inside('table'):
                        ans = join(ans, *self.post_float_yuck)
                        del self.post_float_yuck[:]
                    return ans if not warns else docwarns(ans, *warns)
                elif h == 'img':
                    assert False, 'unexpected image'
                elif h == 'a':
                    if 'name' in a:
                        # we can't do that blindly, because we want to
                        # generate labels for things like lists and headings
                        # this is only a fallback for anchors outside of
                        # 'labelled' envs
                        return cmd('hypertarget', [],
                                   [a['name'].lstrip('#'), ''])
                    elif 'href' in a:
                        if a['href'].startswith('#'):
                            return cmd('hyperref',
                                       [latexify_href(a['href'][1:])],
                                       [self.latexify(b)])
                        ##
                        # XXX(alexander): handle bare urls specially, because
                        # we want more relaxed linebreaking rules for them.
                        # Note that we're not using \url directly, because
                        # it's not robust and also can't cope with certain
                        # arguments, such as unbalanced '{'/'}'s. Also, even
                        # with fairly aggressive hyphenization params, this is
                        # in in itself not enough to resolve all overfull hbox
                        # issues with urls, although it's not 100% clear to me
                        # why.
                        elif b and a['href'] in (b[0], url_fix(b[0])):
                            # XXX(alexander): use url_fixed version here?
                            return urldef(a['href'], self.urldefs)
                        else:
                            ans = cmd(
                                'href', [],
                                [latexify_href(a['href']),
                                 self.latexify(b)])
                            if b[0].startswith('http'):
                                ans = docwarn(
                                    ans, 'Suspicious link with body/href'
                                    ' mismatch: %r != %r' %
                                    (a['href'].encode('utf-8'), b[0]))
                            return ans
                    else:
                        assert False, 'Malformed link: %s' % ((h, a, b), )
                elif h == 'aside':
                    return cmd('comment', [], [self.latexify(b)])
                elif h in ('b', 'i', 'u', 's'):
                    assert not a, 'unexpected <%s %r' % (h, a)
                    return self.handle_emphasis(h, b)
                elif h == 'code':
                    #FIXME: write something more specialized
                    return cmd('texttt', [], [self.latexify(b)])
                elif h == 'sup':
                    return cmd('textsuperscript', [], [self.latexify(b)])
                elif h == 'sub':
                    return cmd('textsubscript', [], [self.latexify(b)])
                elif h == '.footnote':
                    with self.inside('.footnote'):
                        if self.am_inside('caption'):
                            self.post_float_yuck.append(
                                cmd('footnotetext', [], [self.latexify(b)]))
                            return cmd(r'protect\footnotemark', [], [])
                        else:
                            return cmd('footnote', [], [self.latexify(b)])
                elif h == '.pagebreak':
                    return nl(cmd('clearpage', [], [self.latexify(b)]))
                elif h == 'br':
                    assert a == {}
                    assert b == []
                    return nl(cmd('newline'))
                elif h == 'blockquote':
                    with self.inside('blockquote'):
                        return blockquote(self.latexify(b))
                elif (h == 'footer' and b == [Seq['cite', :]]
                      and self.am_inside('blockquote')):
                    return nl(cmd('attrib', [], [self.latexify(b[0][2])]))
                elif node == ('CMD', {'class': ['$']}, b):
                    return join('$', b[0], '$')
                elif node == ('CMD', {
                        'class': [Var('CITE', CITE_REX.match)]
                }, b):
                    return self.munge_cite(node, b)
                elif node == ('CMD', {'class': ['tex']}, b):
                    return b[0]
                elif h in ('CMD', 'LIT'):
                    return self.bad_command(*node)
                elif h == 'pre':
                    return highlight.as_latex(node)
                elif h == 'wbr':
                    return '{}'
                else:
                    #FIXME(alexander): set 1 as error-code?
                    log.error('Unexpected tag: %s %r %r', h, a, b)
                    return join("")
Exemplo n.º 22
0
def parse_body(xml, context, normalize_transclusion):
    """Translate ODF body elements into the internal markup tree.

    Iterates over the children of `xml` and yields a mix of plain
    strings (text runs) and ``(head, attrs, body)`` triples built with
    `mkel`.  Recurses depth-first into each element.

    Args:
        xml: iterable of ODF (lxml) elements whose children are parsed.
        context: parse context carrying the style table (``context.stys``)
            and list-nesting state; lists get a bumped copy of it.
        normalize_transclusion: callback used to rewrite image hrefs
            (``src`` attributes of transcluded images).
    """
    # pylint: disable=R0912,R0915,R0914
    for e in xml:
        # lxml convention: .text is the text before the first child,
        # .tail is the text that follows this element's end tag.
        text = (e.text or '')
        tail = (e.tail or '')

        # some style properties should be promoted to tags, e.g. underlining
        # and bolding
        tags_from_style = []
        stys_dealt_with = []

        # <text:s>/<text:tab>: expand to the equivalent run of spaces or
        # tabs; the text:c attribute is the repeat count (default 1).
        if e.tag in (S_TAG, TAB_TAG):
            yield ' \t'[e.tag == TAB_TAG] * int(e.attrib.get(
                ns.text('c'), '1'))
            if tail:
                yield tail
            continue

        if e.tag == LINEBREAK_TAG:
            yield mkel('br', {}, [])
            continue

        # Resolve the element's named (paragraph or table) style, if any.
        sty = context.stys.get(
            e.get(STYLE_NAME_ATTR) or e.get(TABLE_STYLE_NAME_ATTR))
        # handle page breaks
        if sty and sty.par_break:
            assert e.tag in (H_TAG, P_TAG), \
                   "Unexpected page-break in %r" % e.tag
            yield mkel('.pagebreak', {}, [])
            stys_dealt_with.append('par_break')
        # Handle lists specially
        if e.tag == LIST_TAG:
            # Nested lists need a context with a bumped nesting level so
            # list type / start / numbering style are derived per level.
            new_context = context.bump_list_level(sty)
            stys_dealt_with.append('sub_list_styles')
        else:
            new_context = context
        # Recurse into the children first; the tag handlers below only
        # decide head/attrs for this node.
        body = list(parse_body(e, new_context, normalize_transclusion))
        # sanity check: body is either empty or its first element is not a
        # (nested) list -- NB `and` binds tighter than `or` here
        assert type(body) is list and not body or type(body[0]) is not list
        attrs = {}
        if text:
            body = [text] + body
        if sty and sty.type.endswith('title'):
            # Title-like styles collapse their content to plain text.
            head = sty.type
            body = [plaintextify(body)]
            sty = None
        elif e.tag == H_TAG:
            # skip empty headings; NB: this *must* happen
            # after we extracted eventual page-breaks, which are the only
            # useful information empty headings can contain
            if blank(body):
                continue
            head = sty.type
            # FIXME(alexander): keep track of the headings breadcrumbs in
            # context for two reasons
            #
            #  1. to associate errors with specific headings
            #  2. to warn about bad structure e.g. h1 followed by h4,
            #     rather than h2
        elif e.tag == LIST_TAG:
            head = new_context.list_type
            assert head in ('ol', 'ul')
            # only ordered lists can carry an explicit start number
            list_start = new_context.list_start
            if list_start is not None:
                assert head == 'ol'
                attrs['start'] = str(list_start)

            id_ = e.attrib.get(ns.xml('id'))  # pylint: disable=E1101
            if id_ is not None:
                attrs['id'] = id_
            continues = e.attrib.get(ns.text('continue-list'))
            if continues is not None:
                # make this a data attrib, so we can stuff it
                # into the html, which doesn't have direct support
                attrs['data-continue-list'] = continues

        elif e.tag == LIST_ITEM_TAG:
            head = 'li'
        elif e.tag == ANNOTATION_TAG:
            head = 'aside'
        elif e.tag in (CREATOR_TAG, NOTE_CITATION_TAG, BOOKMARK_END_TAG):
            #FIXME: extract content
            if text:
                log.warning('Hey, someone actually specified a %s: %s', e.tag,
                            text)
            if tail:
                yield tail
            continue
        elif e.tag == NOTE_TAG:
            # other valid option is 'endnote'
            assert e.attrib[ns.text('note-class')] == 'footnote'
            # skip ahead and exit early; we only represent the note-body
            assert len(e) == 2 and e[1].tag == NOTE_BODY_TAG
            assert len(body) == 1
            yield body[0]
            if tail:
                yield tail
            continue
        elif e.tag == NOTE_BODY_TAG:
            head = '.footnote'
            # FIXME(alexander): sucky hack to strip the bogus whitespace
            # google docs enters at the beginning of a footnote for some
            # reason. I should really write a more generic whitespace
            # stripping mechanism in the postprocess module that can recognize
            # consecutive whitespace even if seperated-by/wrapped-in inline
            # tags.
            _, B1, B2, = map(Var, '_, B1, B2'.split(', '))
            SPACED_STR = Var(
                'SPACED_STR', lambda s:
                (isinstance(s, basestring) and re.match(r'\s+', s)))
            # if the footnote starts with <p>"  ..." strip the leading
            # whitespace of that first string in place
            if body == Seq[('p', _, Seq[SPACED_STR, B2:]), B1:]:
                body[0][2][0] = SPACED_STR.val.lstrip()
        # FIXME(alexander): add anchors for all paras
        elif e.tag == P_TAG:
            # indented paragraphs become '.block's with an indent level
            margin = sty.margin_left or sty.text_indent if sty else None
            indent_level = in_indents(margin) if margin else 0
            if indent_level:
                head = '.block'
                attrs['indent'] = indent_level
            else:
                head = 'p'

        #FIXME styled links etc. gdocs might not use that...
        #... but we should be able to handle non-span bolding etc.
        elif e.tag == SPAN_TAG:
            # XXX: order can matter; we need
            #   <b><u>command</u><b>
            # not
            #   <u><b>command</b><u>
            #
            # but more generally the minimal coalescing of abutting partially
            # overlapping styles is something that needs to be thought about
            # properly at some point.
            for attr, on_values, html_tags in [
                ('underline', [True], ['u']), ('font_weight', ['bold'], ['b']),
                ('font_style', ['italic'], ['i']),
                ('line_through', [True], ['s']),
                ('text_position', ['sub', 'super'], ['sub', 'sup'])
            ]:
                value = getattr(sty, attr, None)
                if value:
                    if value not in on_values:
                        log.error("Bad value for %s: %s in %s", attr, value,
                                  e.tag)
                        continue
                    tags_from_style.append(html_tags[on_values.index(value)])
                    stys_dealt_with.append(attr)
            # NOTE(review): assumes `sty` is never None for spans -- a span
            # with no resolvable style would raise AttributeError here
            # (unlike the getattr() calls above). TODO confirm invariant.
            if is_code_font(sty.font_family):
                tags_from_style.append('code')
                stys_dealt_with.append('font_family')
            head = 'span'
        elif e.tag == A_TAG:
            assert e.attrib[ns.xlink('type')] == 'simple'
            head = 'a'
            attrs = dict(href=e.attrib[HREF_ATTR])
            # FIXME the in 'span' check is a bit too general, should use
            # something else to markup textcolor
            body = tidy(whack(lambda x: x in ('span', 'u'), body))
        elif e.tag == BOOKMARK_START_TAG:
            head = 'a'
            attrs = dict(name=e.attrib[TEXT_NAME_ATTR])
            # bookmarks are expected to be empty and immediately closed
            assert (blank(text) and blank(tail)
                    and next(e.itersiblings()).tag == BOOKMARK_END_TAG)
        elif e.tag == TABLE_TAG:
            head = 'table'
            body = parse_table_body(body)
        elif e.tag == TABLE_ROW_TAG:
            head = 'tr'
        elif e.tag == TABLE_CELL_TAG:
            head = 'td'
        #FIXME repetition via table:number-columns-repeated
        #FIXME handle column-groups
        elif e.tag == TABLE_COLUMN_TAG:
            head = 'col'
            sty = context.stys.get(e.attrib.get(ns.table('style-name')))
            if sty and sty.width is not None:
                # XXX this isn't really the column width
                # since google moronically saves this even
                # if set column width is turned off thank you google!
                attrs = dict(style=OrderedDict(width=sty.width))
                stys_dealt_with.append('width')

        elif e.tag == FRAME_TAG:
            # XXX: try to find caption
            # FIXME(alexander): keep figures/tables with captions in context,
            # so that we can produce a lot/loi; add an id for all of them
            inline = e.attrib[ns.text('anchor-type')] == 'as-char'
            width = (
                e.attrib.get(ns.svg('width'))  # pylint: disable=E1101
                or e.attrib[ns.style('rel-width')])
            # FIXME(alexander): should handle all these, in theory:
            # <http://www.w3.org/TR/SVG11/struct.html#SVGElementWidthAttribute>
            # ("em" | "ex" | "px" | "in" | "cm" | "mm" | "pt" | "pc" )
            assert width.endswith('cm'), \
                'Expected figure width in cm, got %s' % width
            relwidth = float(width[:-2]) / context.stys.textwidth
            head, attrs, body = make_figure(
                relwidth=relwidth,
                inline=inline,
                # FIXME(alexander): the body[0][1] to access the image
                # will blow up on leading whitespace in the body
                body=list(x for x in body
                          if not (isinstance(x, basestring) and blank(x))),
                src=body[0][1]['src'],
                original_href=e.find(ns.draw('image')).get(ns.xlink('href')))
        elif e.tag == IMAGE_TAG:
            head = 'img'
            attrs = dict(src=normalize_transclusion(e.attrib[HREF_ATTR]))
        else:
            log.warning('Ignoring tag %s', e.tag)
            continue
            # FIXME raise RuntimeError('Unexpected tag: %s' % e.tag)
        # Wrap the tidied body in the tags derived from character styles;
        # the first tag in tags_from_style ends up innermost.
        sty_tagged = reduce(lambda parsed, tag: [mkel(tag, {}, parsed)],
                            tags_from_style, tidy(body))
        if sty:
            if sty.text_align:
                stys_dealt_with.append('text_align')
                attrs = add_class(attrs, sty.text_align)
            if sty.background_color:
                stys_dealt_with.append('background_color')
                iadd_style(attrs, 'background-color', sty.background_color)
            if sty.color:
                stys_dealt_with.append('color')
                iadd_style(attrs, 'color', sty.color)
        if e.tag == LIST_TAG:
            if new_context.list_style_type:
                attrs = add_class(attrs, new_context.list_style_type)
        # FIXME additional tidy
        parsed = mkel(head, attrs, sty_tagged)
        if head == 'span' and 'style' in attrs:
            # Hoist <code> out of a styled span: span>code -> code>span,
            # so the code tag stays outermost.
            B = Var('B')
            if parsed == ('span', attrs, [('code', {}, B)]):
                parsed = mkel('code', {}, [('span', attrs, B.val)])

        # Warn about style properties that were resolved but never
        # translated into tags/attrs above.
        leftover_styles = sty and set(
            sty.active_props()) - set(stys_dealt_with)
        if leftover_styles:
            log.warn('Ignoring style elements: %r in %r "%s"',
                     ([(k, getattr(sty, k)) for k in leftover_styles]), head,
                     plaintextify(body))
        preprocess.maybe_anchorize_id(head, attrs, sty_tagged)
        yield parsed
        if tail:
            yield tail
def parse_chunk(s, handle_data_url=None):
    """Parse the HTML fragment `s` and return the children of its <body>.

    `handle_data_url` is forwarded to `parse_body` (used to normalize
    transcluded data: URLs).  Raises AssertionError when the parse does
    not yield exactly one ('body', ...) element.
    """
    # Pattern variable; captures the body's children on a successful match.
    BODY = Var('BODY')
    body_elem = parse_html_frag(s).find('body')
    parsed = parse_body([body_elem], handle_data_url)
    # Destructure via the project's Var-matching __eq__ (pattern on the left).
    assert [('body', {}, BODY)] == parsed, 'No body in %r' % (parsed, )
    return BODY.val