Python plaintextify 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: converter.postprocess

메소드/함수: plaintextify

hotexamples.com에서의 예제들: 9

Python plaintextify - 9개의 예제를 찾았습니다. 아래는 오픈 소스 프로젝트에서 추출한, 평가가 가장 높은 converter.postprocess.plaintextify의 실제 사용 예제입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

예제 #1

파일 보기

파일: unparse.py 프로젝트: hybrid-publishing-lab/typesetr-academic

def unparse_literal(lit, roundtrip=True, plain=False): # pylint: disable=R0911
    """Return a string representation of `lit`.

    - `roundtrip` affects how literals with context-dependent values are
       handled, e.g.  when ``roundtrip=False`` then
       ``Date('today') -> "2014-01-01"`` (instead of ``"today"``).

    - `plain` controls if rich text content is converted to plaintext (e.g.
       for pdf or epub metadata)

    Raises `AssertionError` if `lit` is none of the supported kinds
    (``None``, string, bool, object with `to_value`/`to_string`, list).
    """
    # FIXME(alexander): try to break cyclic imports
    import converter.html_writer
    if lit is None:
        return '' # XXX(alexander)
    if isinstance(lit, basestring):
        return lit
    if isinstance(lit, bool):
        # index the tuple with the bool: False -> 'no', True -> 'yes'
        return ('no', 'yes')[lit]
    # `to_value` is only consulted when a round-trippable form isn't needed
    if not roundtrip and hasattr(lit, 'to_value'):
        return lit.to_value()
    if hasattr(lit, 'to_string'):
        return lit.to_string()
    if isinstance(lit, list): # Rich-text
        if plain:
            return plaintextify(lit)
        return converter.html_writer.write_body(lit)
    # Raise explicitly rather than via `assert False`: assert statements are
    # stripped under ``python -O``, which would make this function silently
    # return None for unsupported types.
    raise AssertionError("Unknown literal type %r" % (lit,))

예제 #2

파일 보기

파일: unparse.py 프로젝트: hybrid-publishing-lab/typesetr-academic

def unparse_literal(lit, roundtrip=True, plain=False):  # pylint: disable=R0911
    """Return a string representation of `lit`.

    - `roundtrip` affects how literals with context-dependent values are
       handled, e.g.  when ``roundtrip=False`` then
       ``Date('today') -> "2014-01-01"`` (instead of ``"today"``).

    - `plain` controls if rich text content is converted to plaintext (e.g.
       for pdf or epub metadata)

    Raises `AssertionError` if `lit` is none of the supported kinds
    (``None``, string, bool, object with `to_value`/`to_string`, list).
    """
    # FIXME(alexander): try to break cyclic imports
    import converter.html_writer
    if lit is None:
        return ''  # XXX(alexander)
    if isinstance(lit, basestring):
        return lit
    if isinstance(lit, bool):
        # index the tuple with the bool: False -> 'no', True -> 'yes'
        return ('no', 'yes')[lit]
    # `to_value` is only consulted when a round-trippable form isn't needed
    if not roundtrip and hasattr(lit, 'to_value'):
        return lit.to_value()
    if hasattr(lit, 'to_string'):
        return lit.to_string()
    if isinstance(lit, list):  # Rich-text
        if plain:
            return plaintextify(lit)
        return converter.html_writer.write_body(lit)
    # Raise explicitly rather than via `assert False`: assert statements are
    # stripped under ``python -O``, which would make this function silently
    # return None for unsupported types.
    raise AssertionError("Unknown literal type %r" % (lit, ))

예제 #3

파일 보기

파일: sectionize.py 프로젝트: hybrid-publishing-lab/typesetr-academic

def tocify_heading(e, gensym):
    """Reduce a heading element to its table-of-contents form.

    `e` is unpacked as a `(tag, attrs, body)` triple whose tag must be in
    `H_TAGS`. The result is `(tag, {'id': ID}, [STRING])`, where the id is
    whatever `lift_anchor_id` leaves in the attributes and the single body
    item is the `plaintextify`-ed body.

    As in the original contract, this assumes the heading already has an id
    or is followed by an anchor.
    """
    # pylint: disable=C0103
    tag, attrs, body = e
    assert tag in H_TAGS
    attrs, body = lift_anchor_id(attrs, body, gensym, kill_anchor=True)
    toc_attrs = {'id': attrs['id']}
    return (tag, toc_attrs, [plaintextify(body)])

예제 #4

파일 보기

파일: sectionize.py 프로젝트: hybrid-publishing-lab/typesetr-academic

def tocify_heading(e, gensym):
    """Reduce a heading element to its table-of-contents form.

    `e` is unpacked as a `(tag, attrs, body)` triple whose tag must be in
    `H_TAGS`. The result is `(tag, {'id': ID}, [STRING])`, where the id is
    whatever `lift_anchor_id` leaves in the attributes and the single body
    item is the `plaintextify`-ed body.

    As in the original contract, this assumes the heading already has an id
    or is followed by an anchor.
    """
    # pylint: disable=C0103
    tag, attrs, body = e
    assert tag in H_TAGS
    attrs, body = lift_anchor_id(attrs, body, gensym, kill_anchor=True)
    toc_attrs = {'id': attrs['id']}
    return (tag, toc_attrs, [plaintextify(body)])

예제 #5

파일 보기

파일: metainfo.py 프로젝트: hybrid-publishing-lab/typesetr-academic

        def check_supplied(): # pylint: disable=R0912
            """Validate every supplied meta field against `self._info`.

            Each key of `meta` is first copied verbatim into `canonical_meta`.
            Keys missing from `self._info` are reported through `error`
            (with a spelling suggestion when one is found) and skipped.
            For known keys, the value is converted and stored in `parsed[k]`
            where this code knows how to; type mismatches are reported
            through `error`.

            Reads `meta`, `self._info`, `canonical_meta`, `parsed` and `error`
            from the enclosing scope.
            """
            def try_to_reify(v, parse):
                """Return `parse(v)`, reporting any failure through `error`.

                Relies on the loop variables `k` and `right_type` of the
                enclosing `for` loop at call time. When parsing fails, nothing
                is returned explicitly (i.e. None, provided `error` returns).
                """
                try:
                    return parse(v)
                except (KeyboardInterrupt, SystemExit):
                    # never swallow interpreter-exit requests
                    raise
                except Exception as ex:
                    # pylint: disable=W0631
                    log.info('Meta conversion error on %s, %s', k, ex)
                    error('Not a valid %s format (expected %s)' % (
                        right_type, TYPE_EXAMPLES[right_type]),
                          k,
                          supplied=meta[k])

            for k in meta:
                canonical_meta[k] = meta[k] # default
                if k not in self._info:
                    # unknown field: offer the closest known field name, if any
                    maybe_meants = spellsuggest.spell_suggest(
                        k, self._info.keys())
                    suggestion = (" (did you mean '%s'?)" % maybe_meants[0]
                                  if maybe_meants else '')
                    # title/subtitle get a dedicated message instead of the
                    # generic "unexpected field" one
                    if k not in ('title', 'subtitle'):
                        error("Unexpected field '%s'%s" % (k, suggestion),
                              k, meta[k])
                    else:
                        error("This document type does not have a %s" % k,
                              k, meta[k])
                    continue
                # the typesetr types the supplied python value could stand for
                potential_types = PY_TYPE_TO_TYPESETR_TYPES[type(meta[k])]
                right_type = self._info[k]['type']
                if right_type in potential_types:
                    # value already has an acceptable type; only bibliographies
                    # are converted further in this branch
                    if right_type == 'bibliography':
                        parsed[k] = try_to_reify(meta[k], Bibliography)

                else:
                    if 'rich-text' in potential_types:
                        # rich text supplied where another type is expected:
                        # flatten it to plain text and treat it as 'text'
                        if not isinstance(meta[k], basestring):
                            meta[k] = postprocess.plaintextify(meta[k])
                        potential_types = ('text',)
                    if potential_types == ('text',):
                        parsed[k] = try_to_reify(
                            meta[k],
                            # pylint: disable=W0640
                            lambda v: parse_literal(v, right_type))
                    else:
                        error("Expected meta field '%s:' to be"
                              " of type '%s', not '%s'" % (
                                  k, right_type, potential_types[0]),
                              k, supplied=meta[k])

예제 #6

파일 보기

파일: latex_writer.py 프로젝트: hybrid-publishing-lab/typesetr-academic

    def latexify(self, ast): # pylint: disable=E0102,R0914,R0915,R0911,R0912
        """Recursively convert `ast` to LaTeX source.

        - A list is converted item by item; the results are combined with
          `join` and a trailing double newline is collapsed to a single one.
        - A string is returned escaped via `quote`.
        - Anything else must be a `(head, attrs, body)` tuple and is
          dispatched on its head; the order of the branches matters, since
          several of the later ones match on the whole node.

        Unknown tags are logged as errors and produce `join("")`.
        """
        if isinstance(ast, list):
            return re.sub('\n\n$', '\n',
                          join(*map(self.latexify, ast)))
        else:
            node = ast
            if isinstance(node, basestring):
                return quote(node)
            else:
                assert isinstance(node, tuple)
                h, a, b = node
                if h == 'div':  # canonicalize pseudo-elements
                    # the div's single class becomes the head; note that this
                    # mutates the node's attribute dict in place
                    h = a['class'].pop()
                    assert not a['class']
                    del a['class']

                # headings: tags of the form 'h' + one character
                if h[:-1] == 'h':
                    if self.am_inside('list') or self.am_inside('table'):
                        return docwarn(
                            self.latexify(b),
                            'Cannot have sections inside lists or tables: %r' %
                            postprocess.plaintextify(b))
                    else:
                        with self.inside('section'):
                            if a:
                                log.warn('heading w/ attr %r', a)
                            labels, b = extract_labels(b)
                            return self.section(h, b, labels)
                elif h == 'p':
                    ans = nl(self.latexify(b))
                    if self.am_inside('.footnote') and self.am_inside('table'):
                        return docwarn(ans,
                                       'Multi-paragraph footnotes in tables are'
                                       ' unsupported')
                    return nl(ans)
                elif h == 'span':
                    return self.latexify(b) # XXX
                elif h in ('ol', 'ul'):
                    # pre-bind the list attributes so both list kinds can be
                    # called uniformly with just the latexified body
                    ol = partial(self.enumerate_,
                                 start=a.get('start'),
                                 series=a.get('id'),
                                 resume=a.get('data-continue-list'))
                    with self.inside('list'):
                        return nl(
                            freshline({
                                'ol': ol,
                                'ul': itemize}[h](
                                    self.latexify(b))))
                elif h == 'li':
                    labels, b = extract_labels(b)
                    labelling = (join(*(map(mklabel, labels) + [' ']))
                                 if labels else '')
                    return join(freshline(cmd('item')),
                                labelling, self.latexify(b))
                elif h == 'table':
                    # remember whether we were already in a table *before*
                    # entering this one
                    nested_table = self.am_inside('table')
                    with self.inside('table'):
                        # pylint: disable=C0103
                        CLASS_TO_SPEC = {'left': 'P', 'center': 'C',
                                         'right': 'R', 'justify': 'N'}
                        # shallow copy, so deleting the caption below does not
                        # mutate the node's body list
                        b = b[:]
                        tablecaption = None
                        if b[0][0] == 'caption':
                            with self.inside('caption'):
                                tablecaption = self.latexify(b[0][2])
                            del b[0]

                        colgroup = [el for el in b if el[0] == 'colgroup']
                        rows = [el for el in b if el[0] == 'tr']
                        assert len(colgroup) == 1, \
                                "Expected single colgroup in table %s" % b
                        cols = colgroup[0][2]
                        colspecs = []
                        for col_h, col_a, col_b in cols:
                            # stop at the first colgroup child that isn't a col
                            if col_h != 'col':
                                break
                            assert not col_b

                            # default column type, overridden by an alignment
                            # class if the col carries one
                            coltype = 'P'
                            for cls in CLASS_TO_SPEC:
                                if cls in col_a.get('class', []):
                                    coltype = CLASS_TO_SPEC[cls]

                            coltype = "%s{%s}" % (coltype, textwidth_percent(
                                col_a['style']['width']))

                            colspecs.append(coltype)
                        rows = "\\tabularnewline\n".join(
                            map(self.latexify, rows))
                        if nested_table and tablecaption:
                            docproblem(
                                "Tables within tables can't have captions;"
                                " outputing caption as normal text",
                                level='warning')


                            ans = join(nl(table(colspecs, rows)), tablecaption)
                        else:
                            ans = table(colspecs, rows, tablecaption)
                    # emit the footnote texts that were deferred while inside
                    # captions, but only once we are outside of every table
                    if self.post_float_yuck and not self.am_inside('table'):
                        ans = join(ans, *self.post_float_yuck)
                        del self.post_float_yuck[:]
                    return ans
                elif h == 'col': # FIXME
                    assert False, "Unexpected col"
                elif h == 'tr':
                    return " & ".join(map(self.latexify, b))
                elif h == 'td':
                    if 'headcol' in a.get('class', []):
                        return colh(self.latexify(b))
                    return self.latexify(b)
                elif h == 'th':
                    if 'headcol' in a.get('class', []):
                        return rowh(colh(self.latexify(b)))
                    return rowh(self.latexify(b))
                elif h == 'figure':
                    # shallow copy, so deleting the figcaption below does not
                    # mutate the node's body list
                    b = b[:]
                    if b[0][0] == 'figcaption':
                        with self.inside('caption'):
                            figcaption = self.latexify(b[0][2])
                        del b[0]
                    else:
                        figcaption = None
                    # what remains must be exactly one image
                    assert len(b) == 1 and b[0][0] == 'img'
                    img = b[0][1]['src']
                    inline = False
                    # each entry is an argument list for `docwarns`
                    warns = []
                    if a['style']['display'] == 'inline':
                        if self.am_inside('table'):
                            warns.append([
                                'Margin figures not supported in tables, '
                                'inserting into table cell'])
                        else:
                            inline = True
                    if inline:
                        if figcaption:
                            warns.append(
                                ['Ignoring figcaption for inline figure:'
                                 ' "%s"', figcaption])
                        ans = marginfigure(img=img)
                    else:
                        fakecaption = figcaption and self.am_inside('table')
                        if fakecaption:
                            warns.append([
                                "Figures in tables can't have captions; "
                                "outputing caption as normal text"])
                        # inside blockquotes more complicated figure
                        # environments don't seem to work reliably
                        rawincludegraphics = self.am_inside('blockquote')
                        ans = figure(img=img,
                                     classes=a.get('class', []),
                                     width=a['style']['width'],
                                     figcaption=figcaption,
                                     fakecaption=fakecaption,
                                     rawincludegraphics=rawincludegraphics)
                    # same deferred-footnote flush as for tables
                    if self.post_float_yuck and not self.am_inside('table'):
                        ans = join(ans, *self.post_float_yuck)
                        del self.post_float_yuck[:]
                    return ans if not warns else docwarns(ans, *warns)
                elif h == 'img':
                    # images are only expected as the child of a figure, which
                    # consumes them directly above
                    assert False, 'unexpected image'
                elif h == 'a':
                    if 'name' in a:
                        # we can't do that blindly, because we want to
                        # generate labels for things like lists and headings
                        # this is only a fallback for anchors outside of
                        # 'labelled' envs
                        return cmd('hypertarget', [],
                                   [a['name'].lstrip('#'), ''])
                    elif 'href' in a:
                        # document-internal link
                        if a['href'].startswith('#'):
                            return cmd('hyperref',
                                       [latexify_href(a['href'][1:])],
                                       [self.latexify(b)])
                        ##
                        # XXX(alexander): handle bare urls specially, because
                        # we want more relaxed linebreaking rules for them.
                        # Note that we're not using \url directly, because
                        # it's not robust and also can't cope with certain
                        # arguments, such as unbalanced '{'/'}'s. Also, even
                        # with fairly aggressive hyphenization params, this is
                        # in in itself not enough to resolve all overfull hbox
                        # issues with urls, although it's not 100% clear to me
                        # why.
                        elif b and a['href'] in (b[0], url_fix(b[0])):
                            # XXX(alexander): use url_fixed version here?
                            return urldef(a['href'], self.urldefs)
                        else:
                            ans = cmd('href', [], [latexify_href(a['href']),
                                                   self.latexify(b)])
                            # NOTE(review): this assumes `b` is non-empty and
                            # that b[0] is a string; an empty body or a
                            # leading element tuple would fail here -- confirm
                            # callers guarantee this.
                            if b[0].startswith('http'):
                                ans = docwarn(
                                    ans,
                                    'Suspicious link with body/href'
                                    ' mismatch: %r != %r' % (
                                        a['href'].encode('utf-8'), b[0]))
                            return ans
                    else:
                        assert False, 'Malformed link: %s' % ((h, a, b),)
                elif h == 'aside':
                    return cmd('comment', [], [self.latexify(b)])
                elif h in ('b', 'i', 'u', 's'):
                    assert not a, 'unexpected <%s %r' % (h, a)
                    return self.handle_emphasis(h, b)
                elif h == 'code':
                    #FIXME: write something more specialized
                    return cmd('texttt', [], [self.latexify(b)])
                elif h == 'sup':
                    return cmd('textsuperscript', [], [self.latexify(b)])
                elif h == 'sub':
                    return cmd('textsubscript', [], [self.latexify(b)])
                elif h == '.footnote':
                    with self.inside('.footnote'):
                        if self.am_inside('caption'):
                            # inside a caption only a footnote mark is emitted;
                            # the footnote text is queued and output later,
                            # after the enclosing table/figure
                            self.post_float_yuck.append(cmd('footnotetext',
                                                            [],
                                                            [self.latexify(b)]))
                            return cmd(r'protect\footnotemark', [], [])
                        else:
                            return cmd('footnote', [], [self.latexify(b)])
                elif h == '.pagebreak':
                    return nl(cmd('clearpage', [], [self.latexify(b)]))
                elif h == 'br':
                    assert a == {}
                    assert b == []
                    return nl(cmd('newline'))
                elif h == 'blockquote':
                    with self.inside('blockquote'):
                        return blockquote(self.latexify(b))
                # a footer whose body matches a single cite pattern, inside a
                # blockquote, becomes the quote's attribution
                elif (h == 'footer' and b == [Seq['cite', :]]
                      and self.am_inside('blockquote')):
                    return nl(cmd('attrib', [], [self.latexify(b[0][2])]))
                # the CMD branches below compare against the whole node;
                # the generic ('CMD', 'LIT') fallback must stay after them
                elif node == ('CMD', {'class': ['$']}, b):
                    return join('$', b[0], '$')
                elif node == ('CMD', {'class': [Var('CITE', CITE_REX.match)]},
                              b):
                    return self.munge_cite(node, b)
                elif node == ('CMD', {'class': ['tex']}, b):
                    return b[0]
                elif h in ('CMD', 'LIT'):
                    return self.bad_command(*node)
                elif h == 'pre':
                    return highlight.as_latex(node)
                elif h == 'wbr':
                    return '{}'
                else:
                    #FIXME(alexander): set 1 as error-code?
                    log.error('Unexpected tag: %s %r %r', h, a, b)
                    return join("")

예제 #7

파일 보기

def parse_body(xml, context, normalize_transclusion):
    """Generate the parsed form of the children of `xml`.

    For each child element `e` this generator yields plain strings (runs of
    spaces/tabs, tail text) and elements built with `mkel(head, attrs, body)`,
    where the child's own children are parsed recursively first. Style
    information looked up in `context.stys` is turned into wrapper tags,
    classes or style attributes; style properties that were not handled are
    logged as ignored. Unknown tags are logged and skipped.

    - `xml`: iterable of elements exposing `tag`, `text`, `tail`, `attrib`
      and `get` (presumably lxml elements, since `itersiblings` is used --
      confirm).
    - `context`: provides `stys` and list state (`bump_list_level` etc.).
    - `normalize_transclusion`: callable applied to image hrefs.
    """
    # pylint: disable=R0912,R0915,R0914
    for e in xml:
        text = (e.text or '')
        tail = (e.tail or '')

        # some style properties should be promoted to tags, e.g. underlining
        # and bolding
        tags_from_style = []
        stys_dealt_with = []

        if e.tag in (S_TAG, TAB_TAG):
            # index the 2-char string with the bool: ' ' for S_TAG, a tab for
            # TAB_TAG; repeated according to the `c` attribute (default 1)
            yield ' \t'[e.tag == TAB_TAG] * int(e.attrib.get(
                ns.text('c'), '1'))
            if tail:
                yield tail
            continue

        if e.tag == LINEBREAK_TAG:
            yield mkel('br', {}, [])
            continue

        sty = context.stys.get(
            e.get(STYLE_NAME_ATTR) or e.get(TABLE_STYLE_NAME_ATTR))
        # handle page breaks
        if sty and sty.par_break:
            assert e.tag in (H_TAG, P_TAG), \
                   "Unexpected page-break in %r" % e.tag
            yield mkel('.pagebreak', {}, [])
            stys_dealt_with.append('par_break')
        # Handle lists specially
        if e.tag == LIST_TAG:
            new_context = context.bump_list_level(sty)
            stys_dealt_with.append('sub_list_styles')
        else:
            new_context = context
        # parse the children first; the branches below decide how to wrap them
        body = list(parse_body(e, new_context, normalize_transclusion))
        assert type(body) is list and not body or type(body[0]) is not list
        attrs = {}
        if text:
            body = [text] + body
        if sty and sty.type.endswith('title'):
            head = sty.type
            # flatten the content to one plain-text string and drop the style,
            # so none of the style handling further down applies
            body = [plaintextify(body)]
            sty = None
        elif e.tag == H_TAG:
            # skip empty headings; NB: this *must* happen
            # after we extracted eventual page-breaks, which are the only
            # useful information empty headings can contain
            if blank(body):
                continue
            # NOTE(review): assumes headings always resolve to a style
            # (`sty` is not None) -- confirm
            head = sty.type
            # FIXME(alexander): keep track of the headings breadcrumbs in
            # context for two reasons
            #
            #  1. to associate errors with specific headings
            #  2. to warn about bad structure e.g. h1 followed by h4,
            #     rather than h2
        elif e.tag == LIST_TAG:
            head = new_context.list_type
            assert head in ('ol', 'ul')
            list_start = new_context.list_start
            if list_start is not None:
                assert head == 'ol'
                attrs['start'] = str(list_start)

            id_ = e.attrib.get(ns.xml('id'))  # pylint: disable=E1101
            if id_ is not None:
                attrs['id'] = id_
            continues = e.attrib.get(ns.text('continue-list'))
            if continues is not None:
                # make this a data attrib, so we can stuff it
                # into the html, which doesn't have direct support
                attrs['data-continue-list'] = continues

        elif e.tag == LIST_ITEM_TAG:
            head = 'li'
        elif e.tag == ANNOTATION_TAG:
            head = 'aside'
        elif e.tag in (CREATOR_TAG, NOTE_CITATION_TAG, BOOKMARK_END_TAG):
            #FIXME: extract content
            if text:
                log.warning('Hey, someone actually specified a %s: %s', e.tag,
                            text)
            if tail:
                yield tail
            continue
        elif e.tag == NOTE_TAG:
            # other valid option is 'endnote'
            assert e.attrib[ns.text('note-class')] == 'footnote'
            # skip ahead and exit early; we only represent the note-body
            assert len(e) == 2 and e[1].tag == NOTE_BODY_TAG
            assert len(body) == 1
            yield body[0]
            if tail:
                yield tail
            continue
        elif e.tag == NOTE_BODY_TAG:
            head = '.footnote'
            # FIXME(alexander): sucky hack to strip the bogus whitespace
            # google docs enters at the beginning of a footnote for some
            # reason. I should really write a more generic whitespace
            # stripping mechanism in the postprocess module that can recognize
            # consecutive whitespace even if seperated-by/wrapped-in inline
            # tags.
            _, B1, B2, = map(Var, '_, B1, B2'.split(', '))
            SPACED_STR = Var(
                'SPACED_STR', lambda s:
                (isinstance(s, basestring) and re.match(r'\s+', s)))
            # pattern: first item is a paragraph whose first body item is a
            # string starting with whitespace; strip that leading whitespace
            if body == Seq[('p', _, Seq[SPACED_STR, B2:]), B1:]:
                body[0][2][0] = SPACED_STR.val.lstrip()
        # FIXME(alexander): add anchors for all paras
        elif e.tag == P_TAG:
            margin = sty.margin_left or sty.text_indent if sty else None
            indent_level = in_indents(margin) if margin else 0
            # indented paragraphs become '.block' elements with an indent level
            if indent_level:
                head = '.block'
                attrs['indent'] = indent_level
            else:
                head = 'p'

        #FIXME styled links etc. gdocs might not use that...
        #... but we should be able to handle non-span bolding etc.
        elif e.tag == SPAN_TAG:
            # XXX: order can matter; we need
            #   <b><u>command</u><b>
            # not
            #   <u><b>command</b><u>
            #
            # but more generally the minimal coalescing of abutting partially
            # overlapping styles is something that needs to be thought about
            # properly at some point.
            for attr, on_values, html_tags in [
                ('underline', [True], ['u']), ('font_weight', ['bold'], ['b']),
                ('font_style', ['italic'], ['i']),
                ('line_through', [True], ['s']),
                ('text_position', ['sub', 'super'], ['sub', 'sup'])
            ]:
                value = getattr(sty, attr, None)
                if value:
                    if value not in on_values:
                        log.error("Bad value for %s: %s in %s", attr, value,
                                  e.tag)
                        continue
                    # on_values and html_tags are parallel lists
                    tags_from_style.append(html_tags[on_values.index(value)])
                    stys_dealt_with.append(attr)
            if is_code_font(sty.font_family):
                tags_from_style.append('code')
                stys_dealt_with.append('font_family')
            head = 'span'
        elif e.tag == A_TAG:
            assert e.attrib[ns.xlink('type')] == 'simple'
            head = 'a'
            attrs = dict(href=e.attrib[HREF_ATTR])
            # FIXME the in 'span' check is a bit too general, should use
            # something else to markup textcolor
            body = tidy(whack(lambda x: x in ('span', 'u'), body))
        elif e.tag == BOOKMARK_START_TAG:
            head = 'a'
            attrs = dict(name=e.attrib[TEXT_NAME_ATTR])
            # a bookmark start must be empty and immediately followed by its
            # matching end tag
            assert (blank(text) and blank(tail)
                    and next(e.itersiblings()).tag == BOOKMARK_END_TAG)
        elif e.tag == TABLE_TAG:
            head = 'table'
            body = parse_table_body(body)
        elif e.tag == TABLE_ROW_TAG:
            head = 'tr'
        elif e.tag == TABLE_CELL_TAG:
            head = 'td'
        #FIXME repetition via table:number-columns-repeated
        #FIXME handle column-groups
        elif e.tag == TABLE_COLUMN_TAG:
            head = 'col'
            sty = context.stys.get(e.attrib.get(ns.table('style-name')))
            if sty and sty.width is not None:
                # XXX this isn't really the column width
                # since google moronically saves this even
                # if set column width is turned off thank you google!
                attrs = dict(style=OrderedDict(width=sty.width))
                stys_dealt_with.append('width')

        elif e.tag == FRAME_TAG:
            # XXX: try to find caption
            # FIXME(alexander): keep figures/tables with captions in context,
            # so that we can produce a lot/loi; add an id for all of them
            inline = e.attrib[ns.text('anchor-type')] == 'as-char'
            width = (
                e.attrib.get(ns.svg('width'))  # pylint: disable=E1101
                or e.attrib[ns.style('rel-width')])
            # FIXME(alexander): should handle all these, in theory:
            # <http://www.w3.org/TR/SVG11/struct.html#SVGElementWidthAttribute>
            # ("em" | "ex" | "px" | "in" | "cm" | "mm" | "pt" | "pc" )
            assert width.endswith('cm'), \
                'Expected figure width in cm, got %s' % width
            # figure width as a fraction of the text width
            relwidth = float(width[:-2]) / context.stys.textwidth
            head, attrs, body = make_figure(
                relwidth=relwidth,
                inline=inline,
                # FIXME(alexander): the body[0][1] to access the image
                # will blow up on leading whitespace in the body
                body=list(x for x in body
                          if not (isinstance(x, basestring) and blank(x))),
                src=body[0][1]['src'],
                original_href=e.find(ns.draw('image')).get(ns.xlink('href')))
        elif e.tag == IMAGE_TAG:
            head = 'img'
            attrs = dict(src=normalize_transclusion(e.attrib[HREF_ATTR]))
        else:
            log.warning('Ignoring tag %s', e.tag)
            continue
            # FIXME raise RuntimeError('Unexpected tag: %s' % e.tag)
        # wrap the tidied body in the style-derived tags, one at a time; each
        # tag wraps the result so far, so later tags end up outermost
        sty_tagged = reduce(lambda parsed, tag: [mkel(tag, {}, parsed)],
                            tags_from_style, tidy(body))
        if sty:
            if sty.text_align:
                stys_dealt_with.append('text_align')
                attrs = add_class(attrs, sty.text_align)
            if sty.background_color:
                stys_dealt_with.append('background_color')
                iadd_style(attrs, 'background-color', sty.background_color)
            if sty.color:
                stys_dealt_with.append('color')
                iadd_style(attrs, 'color', sty.color)
        if e.tag == LIST_TAG:
            if new_context.list_style_type:
                attrs = add_class(attrs, new_context.list_style_type)
        # FIXME additional tidy
        parsed = mkel(head, attrs, sty_tagged)
        if head == 'span' and 'style' in attrs:
            # a styled span directly wrapping a code element is turned inside
            # out, so that `code` is the outer element (presumably the
            # comparison binds `B` to the code body -- confirm)
            B = Var('B')
            if parsed == ('span', attrs, [('code', {}, B)]):
                parsed = mkel('code', {}, [('span', attrs, B.val)])

        # active style properties that none of the code above accounted for
        leftover_styles = sty and set(
            sty.active_props()) - set(stys_dealt_with)
        if leftover_styles:
            log.warn('Ignoring style elements: %r in %r "%s"',
                     ([(k, getattr(sty, k)) for k in leftover_styles]), head,
                     plaintextify(body))
        preprocess.maybe_anchorize_id(head, attrs, sty_tagged)
        yield parsed
        if tail:
            yield tail

예제 #8

파일 보기

    def latexify(self, ast):  # pylint: disable=E0102,R0914,R0915,R0911,R0912
        if isinstance(ast, list):
            return re.sub('\n\n$', '\n', join(*map(self.latexify, ast)))
        else:
            node = ast
            if isinstance(node, basestring):
                return quote(node)
            else:
                assert isinstance(node, tuple)
                h, a, b = node
                if h == 'div':  # canonicalize pseudo-elements
                    h = a['class'].pop()
                    assert not a['class']
                    del a['class']

                if h[:-1] == 'h':
                    if self.am_inside('list') or self.am_inside('table'):
                        return docwarn(
                            self.latexify(b),
                            'Cannot have sections inside lists or tables: %r' %
                            postprocess.plaintextify(b))
                    else:
                        with self.inside('section'):
                            if a:
                                log.warn('heading w/ attr %r', a)
                            labels, b = extract_labels(b)
                            return self.section(h, b, labels)
                elif h == 'p':
                    ans = nl(self.latexify(b))
                    if self.am_inside('.footnote') and self.am_inside('table'):
                        return docwarn(
                            ans, 'Multi-paragraph footnotes in tables are'
                            ' unsupported')
                    return nl(ans)
                elif h == 'span':
                    return self.latexify(b)  # XXX
                elif h in ('ol', 'ul'):
                    ol = partial(self.enumerate_,
                                 start=a.get('start'),
                                 series=a.get('id'),
                                 resume=a.get('data-continue-list'))
                    with self.inside('list'):
                        return nl(
                            freshline({
                                'ol': ol,
                                'ul': itemize
                            }[h](self.latexify(b))))
                elif h == 'li':
                    labels, b = extract_labels(b)
                    labelling = (join(*(map(mklabel, labels) +
                                        [' '])) if labels else '')
                    return join(freshline(cmd('item')), labelling,
                                self.latexify(b))
                elif h == 'table':
                    nested_table = self.am_inside('table')
                    with self.inside('table'):
                        # pylint: disable=C0103
                        CLASS_TO_SPEC = {
                            'left': 'P',
                            'center': 'C',
                            'right': 'R',
                            'justify': 'N'
                        }
                        b = b[:]
                        tablecaption = None
                        if b[0][0] == 'caption':
                            with self.inside('caption'):
                                tablecaption = self.latexify(b[0][2])
                            del b[0]

                        colgroup = [el for el in b if el[0] == 'colgroup']
                        rows = [el for el in b if el[0] == 'tr']
                        assert len(colgroup) == 1, \
                                "Expected single colgroup in table %s" % b
                        cols = colgroup[0][2]
                        colspecs = []
                        for col_h, col_a, col_b in cols:
                            if col_h != 'col':
                                break
                            assert not col_b

                            coltype = 'P'
                            for cls in CLASS_TO_SPEC:
                                if cls in col_a.get('class', []):
                                    coltype = CLASS_TO_SPEC[cls]

                            coltype = "%s{%s}" % (coltype,
                                                  textwidth_percent(
                                                      col_a['style']['width']))

                            colspecs.append(coltype)
                        rows = "\\tabularnewline\n".join(
                            map(self.latexify, rows))
                        if nested_table and tablecaption:
                            docproblem(
                                "Tables within tables can't have captions;"
                                " outputing caption as normal text",
                                level='warning')

                            ans = join(nl(table(colspecs, rows)), tablecaption)
                        else:
                            ans = table(colspecs, rows, tablecaption)
                    if self.post_float_yuck and not self.am_inside('table'):
                        ans = join(ans, *self.post_float_yuck)
                        del self.post_float_yuck[:]
                    return ans
                elif h == 'col':  # FIXME
                    assert False, "Unexpected col"
                elif h == 'tr':
                    return " & ".join(map(self.latexify, b))
                elif h == 'td':
                    if 'headcol' in a.get('class', []):
                        return colh(self.latexify(b))
                    return self.latexify(b)
                elif h == 'th':
                    if 'headcol' in a.get('class', []):
                        return rowh(colh(self.latexify(b)))
                    return rowh(self.latexify(b))
                elif h == 'figure':
                    b = b[:]
                    if b[0][0] == 'figcaption':
                        with self.inside('caption'):
                            figcaption = self.latexify(b[0][2])
                        del b[0]
                    else:
                        figcaption = None
                    assert len(b) == 1 and b[0][0] == 'img'
                    img = b[0][1]['src']
                    inline = False
                    warns = []
                    if a['style']['display'] == 'inline':
                        if self.am_inside('table'):
                            warns.append([
                                'Margin figures not supported in tables, '
                                'inserting into table cell'
                            ])
                        else:
                            inline = True
                    if inline:
                        if figcaption:
                            warns.append([
                                'Ignoring figcaption for inline figure:'
                                ' "%s"', figcaption
                            ])
                        ans = marginfigure(img=img)
                    else:
                        fakecaption = figcaption and self.am_inside('table')
                        if fakecaption:
                            warns.append([
                                "Figures in tables can't have captions; "
                                "outputing caption as normal text"
                            ])
                        # inside blockquotes more complicated figure
                        # environments don't seem to work reliably
                        rawincludegraphics = self.am_inside('blockquote')
                        ans = figure(img=img,
                                     classes=a.get('class', []),
                                     width=a['style']['width'],
                                     figcaption=figcaption,
                                     fakecaption=fakecaption,
                                     rawincludegraphics=rawincludegraphics)
                    if self.post_float_yuck and not self.am_inside('table'):
                        ans = join(ans, *self.post_float_yuck)
                        del self.post_float_yuck[:]
                    return ans if not warns else docwarns(ans, *warns)
                elif h == 'img':
                    assert False, 'unexpected image'
                elif h == 'a':
                    if 'name' in a:
                        # we can't do that blindly, because we want to
                        # generate labels for things like lists and headings
                        # this is only a fallback for anchors outside of
                        # 'labelled' envs
                        return cmd('hypertarget', [],
                                   [a['name'].lstrip('#'), ''])
                    elif 'href' in a:
                        if a['href'].startswith('#'):
                            return cmd('hyperref',
                                       [latexify_href(a['href'][1:])],
                                       [self.latexify(b)])
                        ##
                        # XXX(alexander): handle bare urls specially, because
                        # we want more relaxed linebreaking rules for them.
                        # Note that we're not using \url directly, because
                        # it's not robust and also can't cope with certain
                        # arguments, such as unbalanced '{'/'}'s. Also, even
                        # with fairly aggressive hyphenization params, this is
                        # in in itself not enough to resolve all overfull hbox
                        # issues with urls, although it's not 100% clear to me
                        # why.
                        elif b and a['href'] in (b[0], url_fix(b[0])):
                            # XXX(alexander): use url_fixed version here?
                            return urldef(a['href'], self.urldefs)
                        else:
                            ans = cmd(
                                'href', [],
                                [latexify_href(a['href']),
                                 self.latexify(b)])
                            if b[0].startswith('http'):
                                ans = docwarn(
                                    ans, 'Suspicious link with body/href'
                                    ' mismatch: %r != %r' %
                                    (a['href'].encode('utf-8'), b[0]))
                            return ans
                    else:
                        assert False, 'Malformed link: %s' % ((h, a, b), )
                elif h == 'aside':
                    return cmd('comment', [], [self.latexify(b)])
                elif h in ('b', 'i', 'u', 's'):
                    assert not a, 'unexpected <%s %r' % (h, a)
                    return self.handle_emphasis(h, b)
                elif h == 'code':
                    #FIXME: write something more specialized
                    return cmd('texttt', [], [self.latexify(b)])
                elif h == 'sup':
                    return cmd('textsuperscript', [], [self.latexify(b)])
                elif h == 'sub':
                    return cmd('textsubscript', [], [self.latexify(b)])
                elif h == '.footnote':
                    with self.inside('.footnote'):
                        if self.am_inside('caption'):
                            self.post_float_yuck.append(
                                cmd('footnotetext', [], [self.latexify(b)]))
                            return cmd(r'protect\footnotemark', [], [])
                        else:
                            return cmd('footnote', [], [self.latexify(b)])
                elif h == '.pagebreak':
                    return nl(cmd('clearpage', [], [self.latexify(b)]))
                elif h == 'br':
                    assert a == {}
                    assert b == []
                    return nl(cmd('newline'))
                elif h == 'blockquote':
                    with self.inside('blockquote'):
                        return blockquote(self.latexify(b))
                elif (h == 'footer' and b == [Seq['cite', :]]
                      and self.am_inside('blockquote')):
                    return nl(cmd('attrib', [], [self.latexify(b[0][2])]))
                elif node == ('CMD', {'class': ['$']}, b):
                    return join('$', b[0], '$')
                elif node == ('CMD', {
                        'class': [Var('CITE', CITE_REX.match)]
                }, b):
                    return self.munge_cite(node, b)
                elif node == ('CMD', {'class': ['tex']}, b):
                    return b[0]
                elif h in ('CMD', 'LIT'):
                    return self.bad_command(*node)
                elif h == 'pre':
                    return highlight.as_latex(node)
                elif h == 'wbr':
                    return '{}'
                else:
                    #FIXME(alexander): set 1 as error-code?
                    log.error('Unexpected tag: %s %r %r', h, a, b)
                    return join("")

예제 #9

파일 보기

파일: odt_parser.py 프로젝트: hybrid-publishing-lab/typesetr-academic

def parse_body(xml, context, normalize_transclusion):
    # pylint: disable=R0912,R0915,R0914
    """Lazily translate the child elements of `xml` into body content.

    This is a generator. For each child element it yields, in document
    order, a mix of plain strings (element tails, expanded spaces/tabs) and
    elements built with ``mkel(head, attrs, body)``. Children are parsed
    recursively, so the yielded elements contain already-parsed bodies.

    - `xml` is iterated for child elements; each is expected to offer
      ``tag``, ``text``, ``tail``, ``attrib`` and ``get`` (presumably lxml
      elements, given the use of ``itersiblings`` -- confirm with caller).

    - `context` supplies the style lookup (``context.stys``) and list
      nesting state (``bump_list_level``, ``list_type``, ...).

    - `normalize_transclusion` is called on the href of every image to
      obtain the value stored in the ``src`` attribute.

    Unknown tags are logged and skipped; several structural expectations
    are enforced with ``assert``.
    """
    for e in xml:
        text = (e.text or '')
        tail = (e.tail or '')

        # some style properties should be promoted to tags, e.g. underlining
        # and bolding
        tags_from_style = []
        stys_dealt_with = []

        if e.tag in (S_TAG, TAB_TAG):
            # Expand to the requested number of spaces or tabs (default 1).
            yield ' \t'[e.tag == TAB_TAG] * int(e.attrib.get(ns.text('c'), '1'))
            if tail:
                yield tail
            continue

        if e.tag == LINEBREAK_TAG:
            yield mkel('br', {}, [])
            continue

        sty = context.stys.get(e.get(STYLE_NAME_ATTR) or
                               e.get(TABLE_STYLE_NAME_ATTR))
        # handle page breaks
        if sty and sty.par_break:
            assert e.tag in (H_TAG, P_TAG), \
                   "Unexpected page-break in %r" % e.tag
            yield mkel('.pagebreak', {}, [])
            stys_dealt_with.append('par_break')
        # Handle lists specially
        if e.tag == LIST_TAG:
            new_context = context.bump_list_level(sty)
            stys_dealt_with.append('sub_list_styles')
        else:
            new_context = context
        body = list(parse_body(e, new_context, normalize_transclusion))
        assert type(body) is list and not body or type(body[0]) is not list
        attrs = {}
        if text:
            body = [text] + body
        if sty and sty.type.endswith('title'):
            head = sty.type
            body = [plaintextify(body)]
            # Drop the style so that none of the generic style handling
            # further down is applied to titles.
            sty = None
        elif e.tag == H_TAG:
            # skip empty headings; NB: this *must* happen
            # after we extracted eventual page-breaks, which are the only
            # useful information empty headings can contain
            if blank(body):
                continue
            head = sty.type
            # FIXME(alexander): keep track of the headings breadcrumbs in
            # context for two reasons
            #
            #  1. to associate errors with specific headings
            #  2. to warn about bad structure e.g. h1 followed by h4,
            #     rather than h2
        elif e.tag == LIST_TAG:
            head = new_context.list_type
            assert head in ('ol', 'ul')
            list_start = new_context.list_start
            if list_start is not None:
                assert head == 'ol'
                attrs['start'] = str(list_start)

            id_ = e.attrib.get(ns.xml('id')) # pylint: disable=E1101
            if id_ is not None:
                attrs['id'] = id_
            continues = e.attrib.get(ns.text('continue-list'))
            if continues is not None:
                # make this a data attrib, so we can stuff it
                # into the html, which doesn't have direct support
                attrs['data-continue-list'] = continues

        elif e.tag == LIST_ITEM_TAG:
            head = 'li'
        elif e.tag == ANNOTATION_TAG:
            head = 'aside'
        elif e.tag in (CREATOR_TAG, NOTE_CITATION_TAG, BOOKMARK_END_TAG):
            #FIXME: extract content
            if text:
                log.warning('Hey, someone actually specified a %s: %s',
                            e.tag, text)
            if tail:
                yield tail
            continue
        elif e.tag == NOTE_TAG:
            # other valid option is 'endnote'
            assert e.attrib[ns.text('note-class')] == 'footnote'
            # skip ahead and exit early; we only represent the note-body
            assert len(e) == 2 and e[1].tag == NOTE_BODY_TAG
            assert len(body) == 1
            yield body[0]
            if tail:
                yield tail
            continue
        elif e.tag == NOTE_BODY_TAG:
            head = '.footnote'
            # FIXME(alexander): sucky hack to strip the bogus whitespace
            # google docs enters at the beginning of a footnote for some
            # reason. I should really write a more generic whitespace
            # stripping mechanism in the postprocess module that can recognize
            # consecutive whitespace even if separated-by/wrapped-in inline
            # tags.
            _, B1, B2, = map(Var, '_, B1, B2'.split(', '))
            SPACED_STR = Var('SPACED_STR', lambda s: (isinstance(s, basestring)
                                                      and re.match(r'\s+', s)))
            if body == Seq[('p', _, Seq[SPACED_STR, B2:]), B1:]:
                body[0][2][0] = SPACED_STR.val.lstrip()
        # FIXME(alexander): add anchors for all paras
        elif e.tag == P_TAG:
            margin = sty.margin_left or sty.text_indent if sty else None
            indent_level = in_indents(margin) if margin else 0
            if indent_level:
                head = '.block'
                attrs['indent'] = indent_level
            else:
                head = 'p'

        #FIXME styled links etc. gdocs might not use that...
        #... but we should be able to handle non-span bolding etc.
        elif e.tag == SPAN_TAG:
            # XXX: order can matter; we need
            #   <b><u>command</u><b>
            # not
            #   <u><b>command</b><u>
            #
            # but more generally the minimal coalescing of abutting partially
            # overlapping styles is something that needs to be thought about
            # properly at some point.
            for attr, on_values, html_tags in [
                    ('underline', [True], ['u']),
                    ('font_weight', ['bold'], ['b']),
                    ('font_style', ['italic'], ['i']),
                    ('line_through', [True], ['s']),
                    ('text_position',
                     ['sub', 'super'],
                     ['sub', 'sup'])
            ]:
                # getattr with a default also copes with `sty` being None
                value = getattr(sty, attr, None)
                if value:
                    if value not in on_values:
                        log.error("Bad value for %s: %s in %s",
                                  attr, value, e.tag)
                        continue
                    tags_from_style.append(html_tags[on_values.index(value)])
                    stys_dealt_with.append(attr)
            # Guard against spans without a (resolvable) style, consistent
            # with the None-tolerant attribute lookups in the loop above.
            if sty and is_code_font(sty.font_family):
                tags_from_style.append('code')
                stys_dealt_with.append('font_family')
            head = 'span'
        elif e.tag == A_TAG:
            assert e.attrib[ns.xlink('type')] == 'simple'
            head = 'a'
            attrs = dict(href=e.attrib[HREF_ATTR])
            # FIXME the in 'span' check is a bit too general, should use
            # something else to markup textcolor
            body = tidy(whack(lambda x: x in ('span', 'u'), body))
        elif e.tag == BOOKMARK_START_TAG:
            head = 'a'
            attrs = dict(name=e.attrib[TEXT_NAME_ATTR])
            assert (blank(text) and blank(tail) and
                    next(e.itersiblings()).tag == BOOKMARK_END_TAG)
        elif e.tag == TABLE_TAG:
            head = 'table'
            body = parse_table_body(body)
        elif e.tag == TABLE_ROW_TAG:
            head = 'tr'
        elif e.tag == TABLE_CELL_TAG:
            head = 'td'
        #FIXME repetition via table:number-columns-repeated
        #FIXME handle column-groups
        elif e.tag == TABLE_COLUMN_TAG:
            head = 'col'
            sty = context.stys.get(e.attrib.get(ns.table('style-name')))
            if sty and sty.width is not None:
                # XXX this isn't really the column width
                # since google moronically saves this even
                # if set column width is turned off thank you google!
                attrs = dict(style=OrderedDict(width=sty.width))
                stys_dealt_with.append('width')

        elif e.tag == FRAME_TAG:
            # XXX: try to find caption
            # FIXME(alexander): keep figures/tables with captions in context,
            # so that we can produce a lot/loi; add an id for all of them
            inline = e.attrib[ns.text('anchor-type')] == 'as-char'
            width = (e.attrib.get(ns.svg('width')) # pylint: disable=E1101
                     or e.attrib[ns.style('rel-width')])
            # FIXME(alexander): should handle all these, in theory:
            # <http://www.w3.org/TR/SVG11/struct.html#SVGElementWidthAttribute>
            # ("em" | "ex" | "px" | "in" | "cm" | "mm" | "pt" | "pc" )
            assert width.endswith('cm'), \
                'Expected figure width in cm, got %s' % width
            relwidth = float(width[:-2]) / context.stys.textwidth
            # Strip blank text nodes *before* looking for the image, so that
            # leading whitespace in the frame can't be mistaken for it.
            frame_body = list(x for x in body
                              if not (isinstance(x, basestring) and blank(x)))
            head, attrs, body = make_figure(
                relwidth=relwidth, inline=inline,
                body=frame_body,
                src=frame_body[0][1]['src'],
                original_href=e.find(ns.draw('image')).get(ns.xlink('href')))
        elif e.tag == IMAGE_TAG:
            head = 'img'
            attrs = dict(src=normalize_transclusion(e.attrib[HREF_ATTR]))
        else:
            log.warning('Ignoring tag %s', e.tag)
            continue
            # FIXME raise RuntimeError('Unexpected tag: %s' % e.tag)
        # Wrap the tidied body in the style-derived tags; the first entry
        # of `tags_from_style` ends up innermost.
        sty_tagged = reduce(lambda parsed, tag: [mkel(tag, {}, parsed)],
                            tags_from_style, tidy(body))
        if sty:
            if sty.text_align:
                stys_dealt_with.append('text_align')
                attrs = add_class(attrs, sty.text_align)
            if sty.background_color:
                stys_dealt_with.append('background_color')
                iadd_style(attrs, 'background-color', sty.background_color)
            if sty.color:
                stys_dealt_with.append('color')
                iadd_style(attrs, 'color', sty.color)
        if e.tag == LIST_TAG:
            if new_context.list_style_type:
                attrs = add_class(attrs, new_context.list_style_type)
        # FIXME additional tidy
        parsed = mkel(head, attrs, sty_tagged)
        if head == 'span' and 'style' in attrs:
            # Turn <span style><code>..</code></span> inside out, so that
            # the code element is outermost.
            B = Var('B')
            if parsed == ('span', attrs, [('code', {}, B)]):
                parsed = mkel('code', {}, [('span', attrs, B.val)])

        # Report any active style properties that nothing above consumed.
        leftover_styles = sty and set(sty.active_props()) - set(stys_dealt_with)
        if leftover_styles:
            log.warning('Ignoring style elements: %r in %r "%s"', (
                [(k, getattr(sty, k)) for k in leftover_styles]), head,
                        plaintextify(body))
        preprocess.maybe_anchorize_id(head, attrs, sty_tagged)
        yield parsed
        if tail:
            yield tail