Example #1
def fix_header_id(header):
    ID = header.get('id', None)
    prefix = None if (ID is None or ':' not in ID) else ID[:ID.index(':')]

    allowed_prefixes_h = {
        'h1': ['sec', 'app', 'part'],
        'h2': ['sub', 'appsub'],
        'h3': ['subsub', 'appsubsub'],
        'h4': ['par'],
    }

    if header.name in allowed_prefixes_h:
        allowed_prefixes = allowed_prefixes_h[header.name]
        default_prefix = allowed_prefixes[0]

        if ID is None:
            header['id'] = '%s:%s' % (default_prefix, GlobalCounter.header_id)
            GlobalCounter.header_id += 1
        else:
            if prefix is None:
                if ID != 'booktitle':
                    msg = ('Adding prefix %r to current id %r for %s.' %
                           (default_prefix, ID, header.name))
                    header.insert_before(Comment('Warning: ' + msg))
                    header['id'] = default_prefix + ':' + ID
            else:
                if prefix not in allowed_prefixes:
                    msg = ('The prefix %r is not allowed for %s (ID=%r)' %
                           (prefix, header.name, ID))
                    logger.error(msg)
                    header.insert_after(Comment('Error: ' + msg))
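
A minimal usage sketch for this helper. GlobalCounter and logger below are hypothetical stand-ins for the project-specific objects the function expects; only Comment comes from bs4:

import logging
from bs4 import BeautifulSoup, Comment

logger = logging.getLogger(__name__)

class GlobalCounter:
    header_id = 0  # stand-in: monotonically increasing id for unlabeled headers

soup = BeautifulSoup("<h1>Intro</h1><h2 id='sub:setup'>Setup</h2>", "html.parser")
for header in soup.find_all(['h1', 'h2', 'h3', 'h4']):
    fix_header_id(header)
print(soup)  # the <h1> now carries an auto-generated id such as "sec:0"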
Example #2
    def test_text_acquisition_methods(self):
        # These methods are intended for use against Tag, but they
        # work on NavigableString as well.

        s = NavigableString("fee ")
        cdata = CData("fie ")
        comment = Comment("foe ")

        assert "fee " == s.get_text()
        assert "fee" == s.get_text(strip=True)
        assert ["fee "] == list(s.strings)
        assert ["fee"] == list(s.stripped_strings)
        assert ["fee "] == list(s._all_strings())

        assert "fie " == cdata.get_text()
        assert "fie" == cdata.get_text(strip=True)
        assert ["fie "] == list(cdata.strings)
        assert ["fie"] == list(cdata.stripped_strings)
        assert ["fie "] == list(cdata._all_strings())

        # Since a Comment isn't normally considered 'text',
        # these methods generally do nothing.
        assert "" == comment.get_text()
        assert [] == list(comment.strings)
        assert [] == list(comment.stripped_strings)
        assert [] == list(comment._all_strings())

        # Unless you specifically say that comments are okay.
        assert "foe" == comment.get_text(strip=True, types=Comment)
        assert "foe " == comment.get_text(types=(Comment, NavigableString))
Example #3
    def test_smooth(self):
        soup = self.soup("<div>a</div>")
        div = soup.div
        div.append("b")
        div.append("c")
        div.append(Comment("Comment 1"))
        div.append(Comment("Comment 2"))
        div.append("d")
        builder = self.default_builder()
        span = Tag(soup, builder, 'span')
        span.append('1')
        span.append('2')
        div.append(span)

        # At this point the tree has a bunch of adjacent
        # NavigableStrings. This is normal, but it has no meaning in
        # terms of HTML, so we may want to smooth things out for
        # output.

        # Since the <span> tag has two children, its .string is None.
        assert None == div.span.string

        assert 7 == len(div.contents)
        div.smooth()
        assert 5 == len(div.contents)

        # The three strings at the beginning of div.contents have been
        # merged into one string.
        #
        assert 'abc' == div.contents[0]

        # The call is recursive -- the <span> tag was also smoothed.
        assert '12' == div.span.string

        # The two comments have _not_ been merged, even though
        # comments are strings. Merging comments would change the
        # meaning of the HTML.
        assert 'Comment 1' == div.contents[1]
        assert 'Comment 2' == div.contents[2]
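
The minimal standalone case of the behavior this test pins down:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<div>a</div>", "html.parser")
soup.div.append("b")
print(len(soup.div.contents))  # 2: two adjacent NavigableStrings
soup.div.smooth()
print(len(soup.div.contents))  # 1: merged into "ab"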
Example #4
print(tag)
print(tag.contents)
print(tag.get_text())

new_b = soup.new_tag("b")
new_b.string = "This is a new tag "
tag.insert(0, new_b)
tag.smooth()
print(tag.encode())
print(tag.decode())
"""
printer = pprint.PrettyPrinter()
printer.pprint(soup.head)
"""
# formatter
from bs4.formatter import HTMLFormatter
def uppercase(s):  # avoid shadowing the built-in str
    return s.upper()
formatter = HTMLFormatter(uppercase)
print(tag.prettify(formatter=formatter))
# comments
from bs4.element import Comment
my_comment = Comment("This is a comment")
tag.insert(2, my_comment)
print(tag.prettify(formatter=formatter))
print(tag.get_text())

print(soup("a") == soup.find_all("a"))


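
The fragment above assumes a soup and a tag defined earlier in the script; a self-contained sketch of the same calls, on a made-up document:

from bs4 import BeautifulSoup
from bs4.element import Comment
from bs4.formatter import HTMLFormatter

soup = BeautifulSoup("<p>Hello <i>world</i></p>", "html.parser")
tag = soup.p

new_b = soup.new_tag("b")
new_b.string = "This is a new tag "
tag.insert(0, new_b)
tag.smooth()

formatter = HTMLFormatter(lambda s: s.upper())  # entity_substitution callback
print(tag.prettify(formatter=formatter))        # text nodes come out uppercased

tag.insert(2, Comment("This is a comment"))
print(tag.get_text())  # comment text is excluded from get_text()

print(soup("a") == soup.find_all("a"))  # calling a soup is shorthand for find_all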
Example #5
def make_videos_(o, res, location, raise_on_errors):
    if 'src' not in o.attrs:
        msg = 'The video does not have a "src" attribute.'
        res.note_error(msg, HTMLIDLocation.for_element(o, location))
        return
        # raise_desc(ValueError, msg, element=str(o))

    src = o.attrs['src']
    prefix = 'vimeo:'
    if not src.startswith(prefix):
        msg = 'Invalid src attribute "%s": it does not start with %r.' % (
            src, prefix)
        res.note_error(msg, HTMLIDLocation.for_element(o, location))
        return
        # raise_desc(ValueError, msg, element=str(o))

    vimeo_id = src[len(prefix):]

    #     <iframe src="https://player.vimeo.com/video/152233002"
    #         class="embed-responsive-item"
    #         frameborder="0" webkitallowfullscreen="" mozallowfullscreen="" allowfullscreen="">

    try:
        vimeo_info = get_vimeo_info(vimeo_id)
    except VimeoInfoException as e:
        if raise_on_errors:
            raise
        else:
            msg = str(e)
            # note_error2(o, 'Resource error', str(e))
            res.note_error(msg, HTMLIDLocation.for_element(o, location))
            return

    d = Tag(name='div')
    d.attrs['class'] = 'video'

    ONLY_WEB = 'only-web'
    ONLY_EBOOK = 'only-ebook'
    ONLY_DEADTREE = 'only-deadtree'

    d.append(Comment('This is the iframe, for online playing.'))
    C = Tag(name='div')
    C.attrs['class'] = ONLY_WEB
    if True:
        r = Tag(name='iframe')
        r.attrs['class'] = 'video-vimeo-player'
        r.attrs['src'] = 'https://player.vimeo.com/video/' + vimeo_id
        r.attrs['frameborder'] = 0
        r.attrs['webkitallowfullscreen'] = 1
        r.attrs['mozallowfullscreen'] = 1
        r.attrs['allowfullscreen'] = 1
        C.append(r)
    d.append(C)

    d.append(Comment('This is the thumbnail, for the ebook.'))
    C = Tag(name='div')
    C.attrs['class'] = ONLY_EBOOK
    if True:
        a = Tag(name='a')
        a.attrs['href'] = vimeo_info.url
        img = Tag(name='img')
        img.attrs['class'] = 'video-vimeo-thumbnail-ebook'
        img.attrs['src'] = vimeo_info.thumbnail_large
        img.attrs['title'] = vimeo_info.title
        a.append(img)
        C.append(a)
    d.append(C)

    d.append(Comment('This is the textual version for printing.'))
    C = Tag(name='div')
    C.attrs['class'] = ONLY_DEADTREE
    if True:
        img = Tag(name='img')
        img.attrs['class'] = 'video-vimeo-thumbnail-deadtree'
        img.attrs['src'] = vimeo_info.thumbnail_large
        img.attrs['title'] = vimeo_info.title
        C.append(img)
        p = Tag(name='p')
        p.append("The video is at %s." % vimeo_info.url)
        C.append(p)
    d.append(C)

    for att in ['style']:
        if att in o.attrs:
            d.attrs[att] = o.attrs[att]
    o.replace_with(d)
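
get_vimeo_info, res.note_error and HTMLIDLocation are project helpers; the bs4 pattern itself reduces to building a subtree and swapping it in for the placeholder element:

from bs4 import BeautifulSoup, Comment, Tag

soup = BeautifulSoup('<video src="vimeo:152233002"></video>', 'html.parser')
o = soup.video

d = Tag(name='div')
d.attrs['class'] = 'video'
d.append(Comment('This is the iframe, for online playing.'))
r = Tag(name='iframe')
r.attrs['src'] = 'https://player.vimeo.com/video/' + o.attrs['src'][len('vimeo:'):]
d.append(r)
o.replace_with(d)
print(soup)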
Example #6
def manual_join(template,
                files_contents,
                stylesheet,
                remove=None,
                extra_css=None,
                remove_selectors=None,
                hook_before_toc=None,
                references=None,
                resolve_references=True,
                hook_before_final_pass=None,
                require_toc_placeholder=False,
                permalink_prefix=None,
                crossrefs_aug=None,
                aug0=None):
    """
        files_contents: a list of tuples that can be cast to DocToJoin:
        where the string is a unique one to be used for job naming.

        extra_css: if not None, a string of more CSS to be added
        Remove_selectors: list of selectors to remove (e.g. ".draft").

        hook_before_toc if not None is called with hook_before_toc(soup=soup)
        just before generating the toc
    """
    result = AugmentedResult()

    if references is None:
        references = {}
    check_isinstance(files_contents, list)

    if crossrefs_aug is None:
        crossrefs = Tag(name='no-cross-refs')
    else:
        crossrefs = bs(crossrefs_aug.get_result())
        result.merge(crossrefs_aug)
    if aug0 is not None:
        result.merge(aug0)

    @contextmanager
    def timeit(_):
        yield

    with timeit('manual_join'):

        files_contents = [DocToJoin(*_) for _ in files_contents]

        # cannot use bs() because this is an entire document, not a fragment
        with timeit('parsing template'):
            template0 = template
            template = replace_macros(template)
            template_soup = BeautifulSoup(template,
                                          'lxml',
                                          from_encoding='utf-8')
            d = template_soup
            if d.html is None:
                s = "Invalid template"
                raise_desc(ValueError, s, template0=template0)

        with timeit('adding head'):
            assert d.html is not None
            assert '<html' in str(d)
            head = d.find('head')
            if head is None:
                msg = 'Could not find <head> in template:'
                logger.error(msg)
                logger.error(str(d))
                raise Exception(msg)
            assert head is not None
            for x in get_manual_css_frag().contents:
                head.append(x.__copy__())

        with timeit('adding stylesheet'):
            if stylesheet is not None:
                link = Tag(name='link')
                link['rel'] = 'stylesheet'
                link['type'] = 'text/css'
                from mcdp_report.html import get_css_filename
                link['href'] = get_css_filename('compiled/%s' % stylesheet)
                head.append(link)

        with timeit('making basename2soup'):
            basename2soup = OrderedDict()
            for doc_to_join in files_contents:
                if doc_to_join.docname in basename2soup:
                    msg = 'Repeated docname %r' % doc_to_join.docname
                    raise ValueError(msg)
                from .latex.latex_preprocess import assert_not_inside
                if isinstance(doc_to_join.contents, AugmentedResult):
                    result.merge(doc_to_join.contents)
                    contents = doc_to_join.contents.get_result()
                else:
                    contents = doc_to_join.contents
                assert_not_inside(contents, '<fragment')
                assert_not_inside(contents, 'DOCTYPE')

                frag = bs(contents)
                basename2soup[doc_to_join.docname] = frag

        # with timeit('fix_duplicate_ids'):
        # XXX
        # fix_duplicated_ids(basename2soup)

        with timeit('copy contents'):
            body = d.find('body')
            add_comments = False

            for docname, content in basename2soup.items():
                if add_comments:
                    body.append(NavigableString('\n\n'))
                    body.append(
                        Comment('Beginning of document dump of %r' % docname))
                    body.append(NavigableString('\n\n'))

                try_faster = True
                if try_faster:
                    for e in list(content.children):
                        body.append(e.extract())
                else:
                    copy_contents_into(content, body)

                if add_comments:
                    body.append(NavigableString('\n\n'))
                    body.append(Comment('End of document dump of %r' %
                                        docname))
                    body.append(NavigableString('\n\n'))

        with timeit('extract_bibtex_blocks'):
            extract_bibtex_blocks(d)

        with timeit('ID_PUT_BIB_HERE'):

            ID_PUT_BIB_HERE = MCDPManualConstants.ID_PUT_BIB_HERE

            bibhere = d.find('div', id=ID_PUT_BIB_HERE)
            if bibhere is None:
                msg = ('Could not find #%s in document. '
                       'Adding one at end of document.') % ID_PUT_BIB_HERE
                result.note_warning(msg)
                bibhere = Tag(name='div')
                bibhere.attrs['id'] = ID_PUT_BIB_HERE
                d.find('body').append(bibhere)

            do_bib(d, bibhere)

        with timeit('hook_before_final_pass'):
            if hook_before_final_pass is not None:
                hook_before_final_pass(soup=d)

        with timeit('document_final_pass_before_toc'):
            location = LocationUnknown()
            document_final_pass_before_toc(d, remove, remove_selectors, result,
                                           location)

        with timeit('hook_before_toc'):
            if hook_before_toc is not None:
                hook_before_toc(soup=d)

        with timeit('generate_and_add_toc'):
            try:
                generate_and_add_toc(d, raise_error=True, res=result)
            except NoTocPlaceholder as e:
                if require_toc_placeholder:
                    msg = 'Could not find toc placeholder: %s' % e
                    # logger.error(msg)
                    if aug0 is not None:
                        result.note_error(msg)
                    else:
                        raise Exception(msg)

        with timeit('document_final_pass_after_toc'):
            document_final_pass_after_toc(
                soup=d,
                crossrefs=crossrefs,
                resolve_references=resolve_references,
                res=result)

        if extra_css is not None:
            logger.info('adding extra CSS')
            add_extra_css(d, extra_css)

        with timeit('document_only_once'):
            document_only_once(d)

        location = LocationUnknown()
        substitute_github_refs(d, defaults={}, res=result, location=location)

        with timeit('another A pass'):
            for a in d.select('a[href]'):
                href = a.attrs['href']
                if href in references:
                    r = references[href]
                    a.attrs['href'] = r.url
                    if not a.contents:  # empty (a.children is an iterator and always truthy)
                        a.append(r.title)

        # do not use to_html_stripping_fragment - this is a complete doc
        # mark_in_html(result, soup=d)

        add_github_links_if_edit_url(soup=d, permalink_prefix=permalink_prefix)

        with timeit('converting to string'):
            res = unicode(d)

        with timeit('encoding'):
            res = res.encode('utf8')

        logger.info('done - %.1f MB' % (len(res) / (1024 * 1024.0)))

        result.set_result(res)
        return result
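
The timeit above is deliberately a no-op, kept so the with-blocks read as labeled sections. A measuring variant would be a small sketch like the following (logger is assumed to be the module's existing logger):

import time
from contextlib import contextmanager

@contextmanager
def timeit(label):
    t0 = time.time()
    yield
    logger.debug('%s took %.2f s' % (label, time.time() - t0))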
Example #7
def col_macro_(e, ncols):
    """
        Bug: For some reasone bd4 removes the whitespace I use for indentation.
        
    
    """
    assert e.name == 'div' 
    assert e.has_attr('make-col%d' % ncols)
    
#     print describe_tag(e)
    children = list(e.children) 
    # remove strings from this
    is_string = lambda x: isinstance(x, NavigableString)
    strings = [_ for _ in children if is_string(_)]
    children = [_ for _ in children if not is_string(_)]
    
    if len(children) < ncols:
        msg = ('Cannot create a table with %d cols from only %d children' %
               (ncols, len(children)))
        raise_desc(ValueError, msg, tag=describe_tag(e))
    
    for c in children:
        c.extract()
        
    for s in strings:
        ss = str(s)
        empty = not ss.strip()
        if not empty:
            msg = 'Found nonempty string %r between children.' % ss 
            raise_desc(ValueError, msg, tag=describe_tag(e))
        # remove it
        s.extract()
        
    nchildren = len(children)
    nrows = int(math.ceil(nchildren / float(ncols)))
    
    parent = e.parent
    original_position = parent.index(e)
    e.extract()
    table = e
    e.name = 'table'
    add_class(table, 'col%d' % ncols)
    add_class(table, 'colN') 
    
    wrapper = Tag(name='div')
    add_class(wrapper, 'col%d-wrap' % ncols)
    add_class(wrapper, 'colN-wrap')
    
    NL = '\n'
    # S = '-' * 4
    # XXX: change to above to see the problem with indentation
    S = ' ' * 4
    tbody = Tag(name='tbody')
    for row in range(nrows):
        tbody.append(NavigableString(NL))
        tbody.append(NavigableString(S+S))
        tr = Tag(name='tr')
        tr.append(NavigableString(NL))
        for col in range(ncols):
            td = Tag(name='td')
            i = col + row * ncols
            if i < len(children):
                child = children[i]
                td.append(child)
            else:
                td.append(Comment('empty row %d col %d' % (row, col)))
            tr.append(NavigableString(S+S+S))
            tr.append(td)
            tr.append(NavigableString(NL))
        tr.append(S+S)
        if row == 0 and ('labels-row1' in e.attrs.get('class', '')):
            thead = Tag(name='thead')
            thead.append(tr)
            table.append(thead) # add in table, not tbody
        else:
            tbody.append(tr)   # add in tbody
        tbody.append(NavigableString(NL+S))
    table.append(tbody)
    
    wrapper.append(NavigableString(NL + S))  
    wrapper.append(table)
    wrapper.append(NavigableString(NL))
    
    parent.insert(original_position, wrapper) 
    
    
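
A hedged usage sketch: add_class is a project helper (stubbed here), and the module's own imports (math, plus Tag, NavigableString and Comment from bs4) are assumed to be in place:

from bs4 import BeautifulSoup

def add_class(tag, cls):  # stand-in for the project's add_class helper
    tag['class'] = tag.get('class', []) + [cls]

soup = BeautifulSoup('<div make-col2=""><p>a</p><p>b</p><p>c</p></div>', 'html.parser')
col_macro_(soup.div, ncols=2)  # rewrites the div into a 2-column table in a wrapper
print(soup.prettify())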
Example #8
def check_if_any_href_is_invalid(soup):
    '''
        Checks whether references are invalid and tries to correct them.

        If a href is of the form "#frag?query", the query is stripped out.
    '''
    logger.debug('check_if_any_href_is_invalid')

    errors = []
    math_errors = []

    # let's first find all the IDs
    id2element, duplicates = get_id2element(soup, 'id')
    _name2element, _duplicates = get_id2element(soup, 'name')

    for a in soup.select('[href^="#"]'):
        href = a['href']
        if a.has_attr('class') and "mjx-svg-href" in a['class']:
            msg = 'Invalid math reference (sorry, no details): href = %s .' % href
            logger.warning(msg)
            a.insert_before(Comment('Error: %s' % msg))
            math_errors.append(msg)
            continue
        assert href.startswith('#')
        ID = href[1:]
        # remove query if it exists
        if '?' in ID:
            ID = ID[:ID.index('?')]

        if ID not in id2element:
            # try to fix it

            # if there is already a prefix, remove it
            if ':' in href:
                i = href.index(':')
                core = href[i + 1:]
            else:
                core = ID

#             logger.debug('check_if_any_href_is_invalid: not found %r, core %r' % (ID, core))

            possible = [
                'part',
                'sec',
                'sub',
                'subsub',
                'fig',
                'tab',
                'code',
                'app',
                'appsub',
                'appsubsub',
                'def',
                'eq',
                'rem',
                'lem',
                'prob',
                'prop',
                'exa',
                'thm',
                #                         'bib'
            ]
            matches = []
            others = []
            for possible_prefix in possible:
                why_not = possible_prefix + ':' + core
                others.append(why_not)
                if why_not in id2element:
                    matches.append(why_not)


#             logger.debug('others = %r, matches = %r' % (others, matches))

            if len(matches) > 1:
                short = 'Ref. error'
                msg = '%s not found, and multiple matches for heuristics (%s)' % (
                    href, matches)
                note_error2(a, short, msg,
                            ['href-invalid', 'href-invalid-missing'])

            elif len(matches) == 1:

                a['href'] = '#' + matches[0]

                if show_debug_message_for_corrected_links:
                    short = 'Ref replaced'
                    msg = '%s not found, but corrected in %s' % (href,
                                                                 matches[0])
                    note_warning2(a, short, msg, ['href-replaced'])

            else:
                if has_class(a, MCDPConstants.CLASS_IGNORE_IF_NOT_EXISTENT):
                    pass
                else:
                    short = 'Ref. error'
                    #                 msg = 'Not found %r (also tried %s)' % (href, ", ".join(others))
                    msg = 'I cannot find the target indicated by the link %r.' % href
                    note_error2(a, short, msg,
                                ['href-invalid', 'href-invalid-missing'])
                    errors.append(msg)

        if ID in duplicates:
            msg = 'More than one element matching %r.' % href
            short = 'Ref. error'
            note_error2(a, short, msg,
                        ['href-invalid', 'href-invalid-multiple'])
            errors.append(msg)

    return errors, math_errors
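
get_id2element is a project helper; its apparent contract (a value-to-element map plus the set of duplicated values) can be sketched as:

def get_id2element(soup, att):
    # Sketch: map attribute value -> element, recording values seen more than once.
    id2element = {}
    duplicates = set()
    for element in soup.select('[%s]' % att):
        value = element.attrs[att]
        if value in id2element:
            duplicates.add(value)
        id2element[value] = element
    return id2element, duplicates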
Example #9
def write_comment(file_object, comment_text):
    if not const.NO_COMMENTS:
        comment = Comment(comment_text)
        comment.setup()  # workaround for BeautifulSoup issue
        file_object.write(comment.output_ready())
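
A sketch of calling it; const.NO_COMMENTS is a project flag, stubbed here:

import io
from bs4.element import Comment

class const:  # stand-in for the project's constants module
    NO_COMMENTS = False

buf = io.StringIO()
write_comment(buf, 'generated file, do not edit')
print(buf.getvalue())  # <!--generated file, do not edit-->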
Example #10
    def make_sections(body,
                      is_marker,
                      preserve=lambda _: False,
                      element_name='section',
                      copy=True,
                      attrs=None):
        attrs = attrs if attrs is not None else {}  # avoid a mutable default argument
        sections = []

        def make_new():
            x = Tag(name=element_name)
            for k, v in attrs.items():
                x.attrs[k] = v
            return x

        current_section = make_new()
        current_section['id'] = 'before-any-match-of-%s' % is_marker.__name__
        current_section['class'] = 'without-header-inside'
        #         sections.append(current_section)
        for x in body.contents:
            if is_marker(x):
                #print('starting %s' % str(x))
                if contains_something_else_than_space(current_section):
                    sections.append(current_section)
                current_section = make_new()
                current_section['id'] = x.attrs.get(
                    'id', 'unnamed-h1') + ':' + element_name
                logger.debug('marker %s' % current_section['id'])
                current_section['class'] = x.attrs.get('class', '')
                #print('%s/section %s %s' % (is_marker.__name__, x.attrs.get('id','unnamed'), current_section['id']))
                current_section.append(x.__copy__())
                current_section['class'] = 'with-header-inside'
            elif preserve(x):
                if contains_something_else_than_space(current_section):
                    sections.append(current_section)

                #current_section['id'] = x.attrs.get('id', 'unnamed-h1') + ':' + element_name
                #print('%s/preserve %s' % (preserve.__name__, current_section['id']))
                sections.append(x.__copy__())
                current_section = make_new()
                current_section.attrs['comment'] = "Triggered by %r" % x
            else:
                #x2 = x.__copy__() if copy else x
                x2 = x.__copy__() if copy else x.extract()
                current_section.append(x2)
        if contains_something_else_than_space(current_section):
            sections.append(current_section)  # XXX
        new_body = Tag(name=body.name)
        #         if len(sections) < 3:
        #             msg = 'Only %d sections found (%s).' % (len(sections), is_marker.__name__)
        #             raise ValueError(msg)

        logger.info('make_sections: %d sections found using marker %s' %
                    (len(sections), is_marker.__name__))
        for i, s in enumerate(sections):
            if add_debug_comments:
                new_body.append('\n')
                new_body.append(
                    Comment('Start of %s section %d/%d' %
                            (is_marker.__name__, i, len(sections))))
            new_body.append('\n')
            new_body.append(s)
            new_body.append('\n')
            if add_debug_comments:
                new_body.append(
                    Comment('End of %s section %d/%d' %
                            (is_marker.__name__, i, len(sections))))
                new_body.append('\n')
        return new_body
Example #11
def manual_join(template,
                files_contents,
                bibfile,
                stylesheet,
                remove=None,
                extra_css=None,
                remove_selectors=None,
                hook_before_toc=None):
    """
        extra_css: if not None, a string of more CSS to be added
        Remove_selectors: list of selectors to remove (e.g. ".draft").

        hook_before_toc if not None is called with hook_before_toc(soup=soup)
        just before generating the toc
    """
    logger.debug('remove_selectors: %s' % remove_selectors)
    logger.debug('remove: %s' % remove)
    from mcdp_utils_xml import bs

    template = replace_macros(template)

    # cannot use bs because entire document
    template_soup = BeautifulSoup(template, 'lxml', from_encoding='utf-8')
    d = template_soup
    assert d.html is not None
    assert '<html' in str(d)
    head = d.find('head')
    assert head is not None
    for x in get_manual_css_frag().contents:
        head.append(x.__copy__())

    if stylesheet is not None:
        link = Tag(name='link')
        link['rel'] = 'stylesheet'
        link['type'] = 'text/css'
        from mcdp_report.html import get_css_filename
        link['href'] = get_css_filename('compiled/%s' % stylesheet)
        head.append(link)

    basename2soup = OrderedDict()
    for (_libname, docname), data in files_contents:
        frag = bs(data)
        basename2soup[docname] = frag

    fix_duplicated_ids(basename2soup)

    body = d.find('body')
    add_comments = False
    for docname, content in basename2soup.items():
        contents_str = str(content)  # use this document, not the stale `data` left over from the loop above
        logger.debug('docname %r -> %s KB' % (docname, len(contents_str) / 1024))
        from mcdp_docs.latex.latex_preprocess import assert_not_inside
        assert_not_inside(contents_str, 'DOCTYPE')
        if add_comments:
            body.append(NavigableString('\n\n'))
            body.append(Comment('Beginning of document dump of %r' % docname))
            body.append(NavigableString('\n\n'))
        for x in content:
            x2 = x.__copy__()  # not clone, not extract
            body.append(x2)
        if add_comments:
            body.append(NavigableString('\n\n'))
            body.append(Comment('End of document dump of %r' % docname))
            body.append(NavigableString('\n\n'))

    extract_bibtex_blocks(d)
    logger.info('external bib')
    if bibfile is not None:
        if not os.path.exists(bibfile):
            logger.error('Cannot find bib file %s' % bibfile)
        else:
            bibliography_entries = get_bibliography(bibfile)
            bibliography_entries['id'] = 'bibliography_entries'
            body.append(bibliography_entries)

    bibhere = d.find('div', id='put-bibliography-here')
    if bibhere is None:
        logger.warning('Could not find #put-bibliography-here in document. '
                       'Adding one at end of document.')
        bibhere = Tag(name='div')
        bibhere.attrs['id'] = 'put-bibliography-here'
        d.find('body').append(bibhere)

    do_bib(d, bibhere)

    if True:
        logger.info('reorganizing contents in <sections>')
        body2 = reorganize_contents(d.find('body'))
        body.replace_with(body2)
    else:
        warnings.warn('fix')
        body2 = body

    # Removing
    all_selectors = []
    if remove is not None and remove != '':
        all_selectors.append(remove)
    if remove_selectors:
        all_selectors.extend(remove_selectors)

    logger.debug('all_selectors: %s' % all_selectors)

    all_removed = ''
    for selector in all_selectors:
        nremoved = 0
        logger.debug('Removing selector %r' % selector)
        toremove = list(body2.select(selector))
        logger.debug('Removing %d objects' % len(toremove))
        for x in toremove:
            nremoved += 1
            nd = len(list(x.descendants))
            logger.debug('removing %s with %s descendants' % (x.name, nd))
            if nd > 1000:
                s = str(x)[:300]
                logger.debug(' it is %s' % s)
            x.extract()

            all_removed += '\n\n' + '-' * 50 + ' chunk %d removed\n' % nremoved
            all_removed += str(x)
            all_removed += '\n\n' + '-' * 100 + '\n\n'

        logger.info('Removed %d elements of selector %r' % (nremoved, selector))


#     if False:
    with open('all_removed.html', 'w') as f:
        f.write(all_removed)

    if hook_before_toc is not None:
        hook_before_toc(soup=d)
    ###
    logger.info('adding toc')
    toc = generate_toc(body2)

    logger.info('TOC:\n' + str(toc))
    toc_ul = bs(toc).ul
    toc_ul.extract()
    assert toc_ul.name == 'ul'
    toc_ul['class'] = 'toc'
    toc_ul['id'] = 'main_toc'
    toc_selector = 'div#toc'
    tocs = list(d.select(toc_selector))
    if not tocs:
        msg = 'Cannot find any element of type %r to put TOC inside.' % toc_selector
        logger.warning(msg)
    else:
        toc_place = tocs[0]
        toc_place.replaceWith(toc_ul)

    logger.info('checking errors')
    check_various_errors(d)

    from mcdp_docs.check_missing_links import check_if_any_href_is_invalid
    logger.info('checking hrefs')
    check_if_any_href_is_invalid(d)

    # Note that this should be done *after* check_if_any_href_is_invalid()
    # because that one might fix some references
    logger.info('substituting empty links')
    substituting_empty_links(d)

    warn_for_duplicated_ids(d)

    if extra_css is not None:
        logger.info('adding extra CSS')
        add_extra_css(d, extra_css)

    add_footnote_polyfill(d)

    logger.info('converting to string')
    # do not use to_html_stripping_fragment - this is a complete doc
    res = unicode(d)
    res = res.encode('utf8')
    logger.info('done - %d bytes' % len(res))
    return res
Example #12
def check_if_any_href_is_invalid(soup):
    '''
        Checks whether references are invalid and tries to correct them.

        If a href is of the form "#frag?query", the query is stripped out.
    '''
    errors = []
    math_errors = []
    
    # let's first find all the IDs
    id2element, duplicates = get_id2element(soup, 'id')
    _name2element, _duplicates = get_id2element(soup, 'name')
#     id2element.update(name2element)
#     for a in soup.select('a[href^="#"]'):

    for a in soup.select('[href^="#"]'):
        href = a['href']
        if a.has_attr('class') and "mjx-svg-href" in a['class']:
            msg = 'Invalid math reference (sorry, no details): href = %s .' % href
            logger.error(msg)
            a.insert_before(Comment('Error: %s' % msg))
            math_errors.append(msg)
            continue 
        assert href.startswith('#')
        ID = href[1:]
        # remove query if it exists
        if '?' in ID:
            ID = ID[:ID.index('?')]
#         not_found = []

        if ID not in id2element:
            # try to fix it
#             
#             # it there is named element
#             if ID in name2element:
#                 real_id = name2element[ID].attrs
            
            # if there is already a prefix, remove it 
            if ':' in href:
                i = href.index(':')
                core = href[i+1:]
            else:
                core = ID
            possible = ['sec', 'sub', 'subsub', 'fig', 'tab', 'code', 'app', 'appsub',
                        'appsubsub',
                        'def', 'eq', 'rem', 'lem', 'prob', 'prop', 'exa', 'thm' ]
            matches = [] 
            others = []
            for possible_prefix in possible:
                why_not = possible_prefix + ':' + core
                others.append(why_not)
                if why_not in id2element:
                    matches.append(why_not)
            
            if len(matches) > 1:
                msg = '%s not found, and multiple matches for heuristics (%s)' % (href, matches)
                logger.error(msg)
                add_class(a, 'errored')
                w = Tag(name='span', attrs={'class':'href-invalid href-invalid-missing'})
                w.string = msg
                a.insert_after(w)
            elif len(matches) == 1:
                msg = '%s not found, but corrected in %s' % (href, matches[0])
                logger.debug(msg)
                
                add_class(a, 'warning')
                w = Tag(name='span', attrs={'class':'href-replaced'})
                w.string = msg
                a['href'] = '#' + matches[0]
                a.insert_after(w)
                
            else:
#                 msg = 'Not found %r (also tried %s)' % (href, ", ".join(others))
#                 not_found.append(ID)
#                 logger.error(msg)
                errors.append('Not found %r' % (href))
                if 'errored' not in a.attrs.get('class', ''):
                    add_class(a, 'errored')
                    w = Tag(name='span', attrs={'class':'href-invalid href-invalid-missing'})
                    w.string = 'Not found %r' % (href)
                    a.insert_after(w)
            
        if ID in duplicates:
            msg = 'More than one element matching %r.' % href
            logger.error(msg)
            if 'errored' not in a.attrs.get('class', ''):
                add_class(a, 'errored')
                w = Tag(name='span', attrs={'class':'href-invalid href-invalid-multiple'})
                w.string = msg
                a.insert_after(w)

            errors.append(msg)
            
    return errors, math_errors
Example #13
def manual_join(template,
                files_contents,
                stylesheet,
                remove=None,
                extra_css=None,
                remove_selectors=None,
                hook_before_toc=None,
                references=None,
                resolve_references=True):
    """
        files_contents: a list of tuples that can be cast to DocToJoin:
        where the string is a unique one to be used for job naming.

        extra_css: if not None, a string of more CSS to be added
        Remove_selectors: list of selectors to remove (e.g. ".draft").

        hook_before_toc if not None is called with hook_before_toc(soup=soup)
        just before generating the toc
    """
    if references is None:  # avoid a mutable default argument
        references = {}
    check_isinstance(files_contents, list)

    files_contents = [DocToJoin(*_) for _ in files_contents]

    template0 = template
    template = replace_macros(template)

    # cannot use bs because entire document
    template_soup = BeautifulSoup(template, 'lxml', from_encoding='utf-8')
    d = template_soup
    if d.html is None:
        s = "Invalid template"
        raise_desc(ValueError, s, template0=template0)

    assert d.html is not None
    assert '<html' in str(d)
    head = d.find('head')
    assert head is not None
    for x in get_manual_css_frag().contents:
        head.append(x.__copy__())

    if stylesheet is not None:
        link = Tag(name='link')
        link['rel'] = 'stylesheet'
        link['type'] = 'text/css'
        from mcdp_report.html import get_css_filename
        link['href'] = get_css_filename('compiled/%s' % stylesheet)
        head.append(link)

    basename2soup = OrderedDict()
    for doc_to_join in files_contents:
        if doc_to_join.docname in basename2soup:
            msg = 'Repeated docname %r' % doc_to_join.docname
            raise ValueError(msg)
        from .latex.latex_preprocess import assert_not_inside
        assert_not_inside(doc_to_join.contents, '<fragment')
        assert_not_inside(doc_to_join.contents, 'DOCTYPE')

        frag = bs(doc_to_join.contents)
        basename2soup[doc_to_join.docname] = frag

    fix_duplicated_ids(basename2soup)

    body = d.find('body')
    add_comments = False
    for docname, content in basename2soup.items():
        #         logger.debug('docname %r -> %s KB' % (docname, len(data) / 1024))
        if add_comments:
            body.append(NavigableString('\n\n'))
            body.append(Comment('Beginning of document dump of %r' % docname))
            body.append(NavigableString('\n\n'))

        copy_contents_into(content, body)

        f = body.find('fragment')
        if f:
            msg = 'I found a <fragment> in the manual after %r' % docname
            msg += '\n\n' + indent(str(content), '> ')
            raise Exception(msg)

        if add_comments:
            body.append(NavigableString('\n\n'))
            body.append(Comment('End of document dump of %r' % docname))
            body.append(NavigableString('\n\n'))

    extract_bibtex_blocks(d)
    logger.info('external bib')

    ID_PUT_BIB_HERE = MCDPManualConstants.ID_PUT_BIB_HERE

    bibhere = d.find('div', id=ID_PUT_BIB_HERE)
    if bibhere is None:
        logger.warning(('Could not find #%s in document. '
                        'Adding one at end of document.') % ID_PUT_BIB_HERE)
        bibhere = Tag(name='div')
        bibhere.attrs['id'] = ID_PUT_BIB_HERE
        d.find('body').append(bibhere)

    do_bib(d, bibhere)

    document_final_pass_before_toc(d, remove, remove_selectors)

    if hook_before_toc is not None:
        hook_before_toc(soup=d)

    generate_and_add_toc(d)

    document_final_pass_after_toc(soup=d,
                                  resolve_references=resolve_references)

    if extra_css is not None:
        logger.info('adding extra CSS')
        add_extra_css(d, extra_css)

    document_only_once(d)

    for a in d.select('[href]'):
        href = a.attrs['href']
        if href in references:
            r = references[href]
            a.attrs['href'] = r.url
            if not a.contents:  # empty (a.children is an iterator and always truthy)
                a.append(r.title)

    logger.info('converting to string')
    # do not use to_html_stripping_fragment - this is a complete doc
    res = unicode(d)
    res = res.encode('utf8')
    logger.info('done - %d bytes' % len(res))
    return res
Example #14
def remove_js(content):
    '''Remove JS inclusions.'''
    ### Bit of cleanup
    scripts = content.find_all('script')
    for s in scripts:
        s.replace_with(Comment("script rimosso"))
Example #15
    def commentClass(self, data):
        return TextNode(Comment(data), self.soup)
Example #16
def remove_doctype_etc(fragment):
    remove = (Declaration, ProcessingInstruction, Doctype)
    for e in list(fragment):
        if isinstance(e, remove):
            c = Comment('Removed object of type %s' % type(e).__name__)
            e.replace_with(c)
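
Usage sketch, with the imports the function relies on:

from bs4 import BeautifulSoup
from bs4.element import Comment, Declaration, Doctype, ProcessingInstruction

soup = BeautifulSoup('<!DOCTYPE html><p>kept</p>', 'html.parser')
remove_doctype_etc(soup)
print(soup)  # <!--Removed object of type Doctype--><p>kept</p>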