def fix_header_id(header):
    """Normalize the ``id`` attribute of a header tag (h1..h4).

    Each header level has a set of allowed id prefixes (e.g. 'sec:' for h1);
    the first entry of the list is the default prefix.

    - If the header has no id at all, a fresh one is generated from the
      default prefix and the global ``GlobalCounter.header_id`` counter.
    - If the id has no prefix (no ':'), the default prefix is prepended and a
      warning Comment is inserted before the header ('booktitle' is exempt).
    - If the id has a prefix that is not allowed for this header level, an
      error is logged and an error Comment is inserted after the header.

    Headers whose tag name is not in the table are left untouched.
    """
    ID = header.get('id', None)
    # Prefix is the part before ':', or None when there is no ':' (or no id).
    prefix = None if (ID is None or ':' not in ID) else ID[:ID.index(':')]
    allowed_prefixes_h = {
        'h1': ['sec', 'app', 'part'],
        'h2': ['sub', 'appsub'],
        'h3': ['subsub', 'appsubsub'],
        'h4': ['par'],
    }
    if header.name in allowed_prefixes_h:
        allowed_prefixes = allowed_prefixes_h[header.name]
        default_prefix = allowed_prefixes[0]
        if ID is None:
            # No id at all: synthesize one from the global counter.
            header['id'] = '%s:%s' % (default_prefix, GlobalCounter.header_id)
            GlobalCounter.header_id += 1
        else:
            if prefix is None:
                # Id present but unprefixed: add the default prefix
                # (except for the special 'booktitle' id).
                if ID != 'booktitle':
                    msg = ('Adding prefix %r to current id %r for %s.' %
                           (default_prefix, ID, header.name))
                    header.insert_before(Comment('Warning: ' + msg))
                    header['id'] = default_prefix + ':' + ID
            else:
                if prefix not in allowed_prefixes:
                    # Wrong prefix for this header level: report, do not fix.
                    msg = ('The prefix %r is not allowed for %s (ID=%r)' %
                           (prefix, header.name, ID))
                    logger.error(msg)
                    header.insert_after(Comment('Error: ' + msg))
def test_text_acquisition_methods(self):
    """Text-extraction methods work on NavigableString/CData/Comment leaves.

    These methods are intended for use against Tag, but they
    work on NavigableString as well.
    """
    s = NavigableString("fee ")
    cdata = CData("fie ")
    comment = Comment("foe ")
    # Plain NavigableString: its own text is returned.
    assert "fee " == s.get_text()
    assert "fee" == s.get_text(strip=True)
    assert ["fee "] == list(s.strings)
    assert ["fee"] == list(s.stripped_strings)
    assert ["fee "] == list(s._all_strings())
    # CData behaves like a regular string for text acquisition.
    assert "fie " == cdata.get_text()
    assert "fie" == cdata.get_text(strip=True)
    assert ["fie "] == list(cdata.strings)
    assert ["fie"] == list(cdata.stripped_strings)
    assert ["fie "] == list(cdata._all_strings())
    # Since a Comment isn't normally considered 'text',
    # these methods generally do nothing.
    assert "" == comment.get_text()
    assert [] == list(comment.strings)
    assert [] == list(comment.stripped_strings)
    assert [] == list(comment._all_strings())
    # Unless you specifically say that comments are okay,
    # via the `types` argument.
    assert "foe" == comment.get_text(strip=True, types=Comment)
    assert "foe " == comment.get_text(types=(Comment, NavigableString))
def test_smooth(self):
    """Tag.smooth() merges adjacent NavigableStrings but never Comments."""
    soup = self.soup("<div>a</div>")
    div = soup.div
    div.append("b")
    div.append("c")
    div.append(Comment("Comment 1"))
    div.append(Comment("Comment 2"))
    div.append("d")
    builder = self.default_builder()
    span = Tag(soup, builder, 'span')
    span.append('1')
    span.append('2')
    div.append(span)
    # At this point the tree has a bunch of adjacent
    # NavigableStrings. This is normal, but it has no meaning in
    # terms of HTML, so we may want to smooth things out for
    # output.
    # Since the <span> tag has two children, its .string is None.
    assert None == div.span.string
    assert 7 == len(div.contents)
    div.smooth()
    assert 5 == len(div.contents)
    # The three strings at the beginning of div.contents have been
    # merged into on string.
    #
    assert 'abc' == div.contents[0]
    # The call is recursive -- the <span> tag was also smoothed.
    assert '12' == div.span.string
    # The two comments have _not_ been merged, even though
    # comments are strings. Merging comments would change the
    # meaning of the HTML.
    assert 'Comment 1' == div.contents[1]
    assert 'Comment 2' == div.contents[2]
# Demo script exercising bs4 Tag manipulation APIs.
# NOTE(review): `soup` and `tag` are defined earlier in the file (not visible
# in this chunk) -- presumably a BeautifulSoup document and one of its tags.
print(tag)
print(tag.contents)
print(tag.get_text())
# Insert a new <b> tag at the front of `tag` and merge adjacent strings.
new_b = soup.new_tag("b")
new_b.string = "This is a new tag "
tag.insert(0,new_b)
tag.smooth()
print(tag.encode())
print(tag.decode())
# Disabled pretty-printing experiment (kept as a string literal on purpose):
""" printer = pprint.PrettyPrinter() printer.pprint(soup.head) """
# formatter: prettify with a custom entity-substitution callable that
# uppercases all text nodes.
from bs4.formatter import HTMLFormatter
def uppercase(str):
    # NOTE(review): parameter shadows the builtin `str`; harmless here.
    return str.upper()
formatter = HTMLFormatter(uppercase)
print(tag.prettify(formatter=formatter))
# comments: Comment nodes can be inserted like any other PageElement.
from bs4.element import Comment
my_comment = Comment("This is a comment")
tag.insert(2,my_comment)
print(tag.prettify(formatter=formatter))
print(tag.get_text())
# Calling the soup is shorthand for find_all.
print(soup("a") == soup.find_all("a"))
def make_videos_(o, res, location, raise_on_errors):
    """Replace element `o` (with src="vimeo:<id>") by a <div class="video">.

    The replacement contains three alternatives, each wrapped in a div with a
    medium-specific class:
      - only-web:      an <iframe> with the Vimeo player;
      - only-ebook:    a linked thumbnail image;
      - only-deadtree: a thumbnail plus a paragraph with the URL.

    Errors (missing/invalid src, Vimeo lookup failure) are recorded on `res`
    via note_error; a VimeoInfoException is re-raised when raise_on_errors.
    The 'style' attribute, if present on `o`, is carried over to the new div.
    """
    if 'src' not in o.attrs:
        msg = 'The video does not have a "src" attribute.'
        res.note_error(msg, HTMLIDLocation.for_element(o, location))
        return
        # raise_desc(ValueError, msg, element=str(o))
    src = o.attrs['src']
    prefix = 'vimeo:'
    if not src.startswith(prefix):
        msg = 'Invalid src attribute "%s": it does not start with %r.' % (
            src, prefix)
        res.note_error(msg, HTMLIDLocation.for_element(o, location))
        return
        # raise_desc(ValueError, msg, element=str(o))
    vimeo_id = src[len(prefix):]
    # Target markup, for reference:
    # <iframe src="https://player.vimeo.com/video/152233002"
    #   class="embed-responsive-item"
    #   frameborder="0" webkitallowfullscreen="" mozallowfullscreen=""
    #   allowfullscreen="">
    try:
        vimeo_info = get_vimeo_info(vimeo_id)
    except VimeoInfoException as e:
        if raise_on_errors:
            raise
        else:
            msg = str(e)
            # note_error2(o, 'Resource error', str(e))
            res.note_error(msg, HTMLIDLocation.for_element(o, location))
            return
    d = Tag(name='div')
    d.attrs['class'] = 'video'
    ONLY_WEB = 'only-web'
    ONLY_EBOOK = 'only-ebook'
    ONLY_DEADTREE = 'only-deadtree'
    d.append(Comment('This is the iframe, for online playing.'))
    C = Tag(name='div')
    C.attrs['class'] = ONLY_WEB
    if True:
        r = Tag(name='iframe')
        r.attrs['class'] = 'video-vimeo-player'
        r.attrs['src'] = 'https://player.vimeo.com/video/' + vimeo_id
        r.attrs['frameborder'] = 0
        r.attrs['webkitallowfullscreen'] = 1
        r.attrs['mozallowfullscreen'] = 1
        r.attrs['allowfullscreen'] = 1
        C.append(r)
    d.append(C)
    d.append(Comment('This is the thumbnail, for ebook'))
    C = Tag(name='div')
    C.attrs['class'] = ONLY_EBOOK
    if True:
        a = Tag(name='a')
        a.attrs['href'] = vimeo_info.url
        img = Tag(name='img')
        img.attrs['class'] = 'video-vimeo-thumbnail-ebook'
        img.attrs['src'] = vimeo_info.thumbnail_large
        img.attrs['title'] = vimeo_info.title
        a.append(img)
        C.append(a)
    d.append(C)
    d.append(Comment('This is the textual version for printing.'))
    C = Tag(name='div')
    C.attrs['class'] = ONLY_DEADTREE
    if True:
        img = Tag(name='img')
        img.attrs['class'] = 'video-vimeo-thumbnail-deadtree'
        img.attrs['src'] = vimeo_info.thumbnail_large
        img.attrs['title'] = vimeo_info.title
        C.append(img)
        p = Tag(name='p')
        p.append("The video is at %s." % vimeo_info.url)
        C.append(p)
    d.append(C)
    # Preserve selected presentation attributes of the original element.
    for att in ['style']:
        if att in o.attrs:
            d.attrs[att] = o.attrs[att]
    o.replace_with(d)
def manual_join(template, files_contents, stylesheet, remove=None,
                extra_css=None, remove_selectors=None, hook_before_toc=None,
                references=None, resolve_references=True,
                hook_before_final_pass=None, require_toc_placeholder=False,
                permalink_prefix=None, crossrefs_aug=None, aug0=None):
    """
        Joins the given document fragments into one document built
        from `template`, and returns an AugmentedResult whose result
        is the final document encoded as UTF-8 bytes.

        files_contents: a list of tuples that can be cast to DocToJoin:
        where the string is a unique one to be used for job naming.

        extra_css: if not None, a string of more CSS to be added
        Remove_selectors: list of selectors to remove (e.g. ".draft").

        hook_before_toc if not None is called with hook_before_toc(soup=soup)
        just before generating the toc
    """
    result = AugmentedResult()
    if references is None:
        references = {}
    check_isinstance(files_contents, list)
    # Cross-references come either from a previous AugmentedResult or
    # default to an empty placeholder tag.
    if crossrefs_aug is None:
        crossrefs = Tag(name='no-cross-refs')
    else:
        crossrefs = bs(crossrefs_aug.get_result())
        result.merge(crossrefs_aug)
    if aug0 is not None:
        result.merge(aug0)

    # No-op timer used only to structure the code into named phases.
    @contextmanager
    def timeit(_):
        yield

    with timeit('manual_join'):
        files_contents = [DocToJoin(*_) for _ in files_contents]
        # cannot use bs because entire document
        with timeit('parsing template'):
            template0 = template
            template = replace_macros(template)
            template_soup = BeautifulSoup(template, 'lxml',
                                          from_encoding='utf-8')
            d = template_soup
            if d.html is None:
                s = "Invalid template"
                raise_desc(ValueError, s, template0=template0)
        with timeit('adding head'):
            assert d.html is not None
            assert '<html' in str(d)
            head = d.find('head')
            if head is None:
                msg = 'Could not find <head> in template:'
                logger.error(msg)
                logger.error(str(d))
                raise Exception(msg)
            assert head is not None
            for x in get_manual_css_frag().contents:
                head.append(x.__copy__())
        with timeit('adding stylesheet'):
            if stylesheet is not None:
                link = Tag(name='link')
                link['rel'] = 'stylesheet'
                link['type'] = 'text/css'
                from mcdp_report.html import get_css_filename
                link['href'] = get_css_filename('compiled/%s' % stylesheet)
                head.append(link)
        with timeit('making basename2soup'):
            # Parse each fragment; docnames must be unique.
            basename2soup = OrderedDict()
            for doc_to_join in files_contents:
                if doc_to_join.docname in basename2soup:
                    msg = 'Repeated docname %r' % doc_to_join.docname
                    raise ValueError(msg)
                from .latex.latex_preprocess import assert_not_inside
                if isinstance(doc_to_join.contents, AugmentedResult):
                    result.merge(doc_to_join.contents)
                    contents = doc_to_join.contents.get_result()
                else:
                    contents = doc_to_join.contents
                assert_not_inside(contents, '<fragment')
                assert_not_inside(contents, 'DOCTYPE')
                frag = bs(contents)
                basename2soup[doc_to_join.docname] = frag
        # with timeit('fix_duplicate_ids'):
        # XXX
        # fix_duplicated_ids(basename2soup)
        with timeit('copy contents'):
            # Move every fragment's children into the template's <body>.
            body = d.find('body')
            add_comments = False
            for docname, content in basename2soup.items():
                if add_comments:
                    body.append(NavigableString('\n\n'))
                    body.append(
                        Comment('Beginning of document dump of %r' % docname))
                    body.append(NavigableString('\n\n'))
                try_faster = True
                if try_faster:
                    # extract() moves nodes instead of copying them.
                    for e in list(content.children):
                        body.append(e.extract())
                else:
                    copy_contents_into(content, body)
                if add_comments:
                    body.append(NavigableString('\n\n'))
                    body.append(
                        Comment('End of document dump of %r' % docname))
                    body.append(NavigableString('\n\n'))
        with timeit('extract_bibtex_blocks'):
            extract_bibtex_blocks(d)
        with timeit('ID_PUT_BIB_HERE'):
            # Ensure there is a container for the bibliography.
            ID_PUT_BIB_HERE = MCDPManualConstants.ID_PUT_BIB_HERE
            bibhere = d.find('div', id=ID_PUT_BIB_HERE)
            if bibhere is None:
                msg = ('Could not find #%s in document. '
                       'Adding one at end of document.') % ID_PUT_BIB_HERE
                result.note_warning(msg)
                bibhere = Tag(name='div')
                bibhere.attrs['id'] = ID_PUT_BIB_HERE
                d.find('body').append(bibhere)
            do_bib(d, bibhere)
        with timeit('hook_before_final_pass'):
            if hook_before_final_pass is not None:
                hook_before_final_pass(soup=d)
        with timeit('document_final_pass_before_toc'):
            location = LocationUnknown()
            document_final_pass_before_toc(d, remove, remove_selectors,
                                           result, location)
        with timeit('hook_before_toc'):
            if hook_before_toc is not None:
                hook_before_toc(soup=d)
        with timeit('generate_and_add_toc'):
            try:
                generate_and_add_toc(d, raise_error=True, res=result)
            except NoTocPlaceholder as e:
                if require_toc_placeholder:
                    msg = 'Could not find toc placeholder: %s' % e
                    # logger.error(msg)
                    if aug0 is not None:
                        result.note_error(msg)
                    else:
                        raise Exception(msg)
        with timeit('document_final_pass_after_toc'):
            document_final_pass_after_toc(
                soup=d, crossrefs=crossrefs,
                resolve_references=resolve_references, res=result)
        if extra_css is not None:
            logger.info('adding extra CSS')
            add_extra_css(d, extra_css)
        with timeit('document_only_once'):
            document_only_once(d)
        location = LocationUnknown()
        substitute_github_refs(d, defaults={}, res=result, location=location)
        with timeit('another A pass'):
            # Resolve known references: rewrite href and fill in empty links.
            for a in d.select('a[href]'):
                href = a.attrs['href']
                if href in references:
                    r = references[href]
                    a.attrs['href'] = r.url
                    # NOTE(review): `a.children` is an iterator, which is
                    # always truthy, so this branch looks unreachable;
                    # `a.contents` was probably intended -- confirm.
                    if not a.children:  # empty
                        a.append(r.title)
        # do not use to_html_stripping_fragment - this is a complete doc
        # mark_in_html(result, soup=d)
        add_github_links_if_edit_url(soup=d, permalink_prefix=permalink_prefix)
        with timeit('converting to string'):
            res = unicode(d)
        with timeit('encoding'):
            res = res.encode('utf8')
        logger.info('done - %.1f MB' % (len(res) / (1024 * 1024.0)))
        result.set_result(res)
        return result
def col_macro_(e, ncols):
    """Convert a div marked with make-col<N> into an N-column <table>.

    The div's element children become table cells, row by row; whitespace-only
    string children are discarded (non-empty strings are an error). The table
    is wrapped in a div with classes col<N>-wrap / colN-wrap and re-inserted
    at the div's original position. If the div has class 'labels-row1', the
    first row goes into a <thead> instead of the <tbody>.

    Bug: For some reasone bd4 removes the whitespace
    I use for indentation.
    """
    assert e.name == 'div'
    assert e.has_attr('make-col%d' % ncols)
    # print describe_tag(e)
    children = list(e.children)
    # remove strings from this
    is_string = lambda x: isinstance(x, NavigableString)
    strings = [_ for _ in children if is_string(_)]
    children = [_ for _ in children if not is_string(_)]
    if len(children) < ncols:
        msg = ('Cannot create table with %r cols with only %d children' %
               (ncols, len(children)))
        raise_desc(ValueError, msg, tag=describe_tag(e))
    for c in children:
        c.extract()
    for s in strings:
        ss = str(s)
        empty = not ss.strip()
        if not empty:
            msg = 'Found nonempty string %r between children.' % ss
            raise_desc(ValueError, msg, tag=describe_tag(e))
        # remove it
        s.extract()
    nchildren = len(children)
    nrows = int(math.ceil(nchildren / float(ncols)))
    parent = e.parent
    original_position = parent.index(e)
    e.extract()
    # The original div is recycled as the <table> element itself.
    table = e
    e.name = 'table'
    add_class(table, 'col%d' % ncols)
    add_class(table, 'colN')
    wrapper = Tag(name='div')
    add_class(wrapper, 'col%d-wrap' % ncols)
    add_class(wrapper, 'colN-wrap')
    # Literal whitespace inserted to get readable output indentation.
    NL = '\n'
    # S = '-' * 4  # XXX: change to above to see the problem with indentation
    S = ' ' * 4
    tbody = Tag(name='tbody')
    for row in range(nrows):
        tbody.append(NavigableString(NL))
        tbody.append(NavigableString(S+S))
        tr = Tag(name='tr')
        tr.append(NavigableString(NL))
        for col in range(ncols):
            td = Tag(name='td')
            i = col + row * ncols
            if i < len(children):
                child = children[i]
                td.append(child)
            else:
                # Pad the last row with placeholder comments.
                td.append(Comment('empty row %d col %d' % (row, col)))
            tr.append(NavigableString(S+S+S))
            tr.append(td)
            tr.append(NavigableString(NL))
        tr.append(S+S)
        if row == 0 and ('labels-row1' in e.attrs.get('class', '')):
            thead = Tag(name='thead')
            thead.append(tr)
            table.append(thead)  # add in table, not tbody
        else:
            tbody.append(tr)  # add in tbody
    tbody.append(NavigableString(NL+S))
    table.append(tbody)
    wrapper.append(NavigableString(NL + S))
    wrapper.append(table)
    wrapper.append(NavigableString(NL))
    parent.insert(original_position, wrapper)
def check_if_any_href_is_invalid(soup):
    ''' Checks if references are invalid and tries to correct them.

        if it is of the form "#frag?query" then query is stripped out

        Returns (errors, math_errors): lists of error message strings.
    '''
    logger.debug('check_if_any_href_is_invalid')
    errors = []
    math_errors = []
    # let's first find all the IDs
    id2element, duplicates = get_id2element(soup, 'id')
    _name2element, _duplicates = get_id2element(soup, 'name')
    for a in soup.select('[href^="#"]'):
        href = a['href']
        # MathJax-generated links cannot be resolved here; just record them.
        if a.has_attr('class') and "mjx-svg-href" in a['class']:
            msg = ('Invalid math reference (sorry, no details): href = %s .'
                   % href)
            logger.warning(msg)
            a.insert_before(Comment('Error: %s' % msg))
            math_errors.append(msg)
            continue
        assert href.startswith('#')
        ID = href[1:]
        # remove query if it exists
        if '?' in ID:
            ID = ID[:ID.index('?')]
        if not ID in id2element:
            # try to fix it
            # if there is already a prefix, remove it
            if ':' in href:
                i = href.index(':')
                core = href[i + 1:]
            else:
                core = ID
            # logger.debug('check_if_any_href_is_invalid: not found %r,
            # core %r' % (ID, core))
            possible = ['part', 'sec', 'sub', 'subsub', 'fig', 'tab', 'code',
                        'app', 'appsub', 'appsubsub', 'def', 'eq', 'rem',
                        'lem', 'prob', 'prop', 'exa', 'thm',
                        # 'bib'
                        ]
            # Heuristic: try every known prefix and see which ones match
            # an existing id.
            matches = []
            others = []
            for possible_prefix in possible:
                why_not = possible_prefix + ':' + core
                others.append(why_not)
                if why_not in id2element:
                    matches.append(why_not)
            # logger.debug('others = %r, matches = %r' % (others, matches))
            if len(matches) > 1:
                short = 'Ref. error'
                msg = ('%s not found, and multiple matches for heuristics'
                       ' (%s)' % (href, matches))
                note_error2(a, short, msg,
                            ['href-invalid', 'href-invalid-missing'])
            elif len(matches) == 1:
                # Exactly one candidate: silently rewrite the href.
                a['href'] = '#' + matches[0]
                if show_debug_message_for_corrected_links:
                    short = 'Ref replaced'
                    msg = ('%s not found, but corrected in %s'
                           % (href, matches[0]))
                    note_warning2(a, short, msg, ['href-replaced'])
            else:
                if has_class(a, MCDPConstants.CLASS_IGNORE_IF_NOT_EXISTENT):
                    pass
                else:
                    short = 'Ref. error'
                    # msg = 'Not found %r (also tried %s)' % (href,
                    #     ", ".join(others))
                    msg = ('I do not know the link that is indicated by the'
                           ' link %r.' % href)
                    note_error2(a, short, msg,
                                ['href-invalid', 'href-invalid-missing'])
                    errors.append(msg)
        if ID in duplicates:
            msg = 'More than one element matching %r.' % href
            short = 'Ref. error'
            note_error2(a, short, msg,
                        ['href-invalid', 'href-invalid-multiple'])
            errors.append(msg)
    return errors, math_errors
def write_comment(file_object, comment_text):
    """Emit *comment_text* as a markup comment onto *file_object*.

    Does nothing when comment output is globally disabled through
    ``const.NO_COMMENTS``.
    """
    if const.NO_COMMENTS:
        return
    node = Comment(comment_text)
    node.setup()  # workaround for BeautifulSoup issue
    file_object.write(node.output_ready())
def make_sections(body, is_marker, preserve=lambda _: False,
                  element_name='section', copy=True, attrs={}):
    """Group the children of `body` into <section>-like wrapper elements.

    A new section starts at each child for which is_marker(child) is True;
    the marker itself becomes the first element of its section. Children for
    which preserve(child) is True are kept as-is between sections (they close
    the current section). Returns a new tag with body's name containing the
    sections; `body` itself is not modified when copy=True.

    NOTE(review): `attrs={}` is a mutable default argument; it is only read
    here (never mutated), so this is latent, but consider `attrs=None`.
    """
    sections = []

    def make_new():
        # Fresh empty section carrying the user-supplied attributes.
        x = Tag(name=element_name)
        for k, v in attrs.items():
            x.attrs[k] = v
        return x

    current_section = make_new()
    current_section['id'] = 'before-any-match-of-%s' % is_marker.__name__
    current_section['class'] = 'without-header-inside'
    # sections.append(current_section)
    for x in body.contents:
        if is_marker(x):
            # print('starting %s' % str(x))
            # Flush the section accumulated so far (if non-empty).
            if contains_something_else_than_space(current_section):
                sections.append(current_section)
            current_section = make_new()
            current_section['id'] = x.attrs.get(
                'id', 'unnamed-h1') + ':' + element_name
            logger.debug('marker %s' % current_section['id'])
            current_section['class'] = x.attrs.get('class', '')
            # print('%s/section %s %s' % (is_marker.__name__,
            #     x.attrs.get('id','unnamed'), current_section['id']))
            current_section.append(x.__copy__())
            current_section['class'] = 'with-header-inside'
        elif preserve(x):
            if contains_something_else_than_space(current_section):
                sections.append(current_section)
            # current_section['id'] = x.attrs.get('id', 'unnamed-h1') +
            #     ':' + element_name
            # print('%s/preserve %s' % (preserve.__name__,
            #     current_section['id']))
            # Preserved elements are emitted verbatim between sections.
            sections.append(x.__copy__())
            current_section = make_new()
            current_section.attrs['comment'] = "Triggered by %r" % x
        else:
            # x2 = x.__copy__() if copy else x
            x2 = x.__copy__() if copy else x.extract()
            current_section.append(x2)
    if contains_something_else_than_space(current_section):
        sections.append(current_section)  # XXX
    new_body = Tag(name=body.name)
    # if len(sections) < 3:
    #     msg = 'Only %d sections found (%s).' % (len(sections),
    #         is_marker.__name__)
    #     raise ValueError(msg)
    logger.info('make_sections: %s found using marker %s' %
                (len(sections), is_marker.__name__))
    for i, s in enumerate(sections):
        if add_debug_comments:
            new_body.append('\n')
            new_body.append(
                Comment('Start of %s section %d/%d'
                        % (is_marker.__name__, i, len(sections))))
        new_body.append('\n')
        new_body.append(s)
        new_body.append('\n')
        if add_debug_comments:
            new_body.append(
                Comment('End of %s section %d/%d'
                        % (is_marker.__name__, i, len(sections))))
            new_body.append('\n')
    return new_body
def manual_join(template, files_contents, bibfile, stylesheet, remove=None,
                extra_css=None, remove_selectors=None, hook_before_toc=None):
    """
        Joins the given document fragments into one document built from
        `template`, and returns the final document as UTF-8 encoded bytes.

        files_contents: iterable of ((libname, docname), data) pairs,
        where data is the HTML fragment for that document.

        extra_css: if not None, a string of more CSS to be added
        Remove_selectors: list of selectors to remove (e.g. ".draft").

        hook_before_toc if not None is called with hook_before_toc(soup=soup)
        just before generating the toc
    """
    logger.debug('remove_selectors: %s' % remove_selectors)
    logger.debug('remove: %s' % remove)
    from mcdp_utils_xml import bs
    template = replace_macros(template)
    # cannot use bs because entire document
    template_soup = BeautifulSoup(template, 'lxml', from_encoding='utf-8')
    d = template_soup
    assert d.html is not None
    assert '<html' in str(d)
    head = d.find('head')
    assert head is not None
    for x in get_manual_css_frag().contents:
        head.append(x.__copy__())
    if stylesheet is not None:
        link = Tag(name='link')
        link['rel'] = 'stylesheet'
        link['type'] = 'text/css'
        from mcdp_report.html import get_css_filename
        link['href'] = get_css_filename('compiled/%s' % stylesheet)
        head.append(link)
    basename2soup = OrderedDict()
    for (_libname, docname), data in files_contents:
        # FIX: the size logging and DOCTYPE check used to run in the copy
        # loop below, where `data` was stale (always the *last* value bound
        # here). Do both per-document, while `data` is actually in scope.
        logger.debug('docname %r -> %s KB' % (docname, len(data) / 1024))
        from mcdp_docs.latex.latex_preprocess import assert_not_inside
        assert_not_inside(data, 'DOCTYPE')
        frag = bs(data)
        basename2soup[docname] = frag
    fix_duplicated_ids(basename2soup)
    body = d.find('body')
    add_comments = False
    # Copy every fragment's children into the template's <body>.
    for docname, content in basename2soup.items():
        if add_comments:
            body.append(NavigableString('\n\n'))
            body.append(Comment('Beginning of document dump of %r' % docname))
            body.append(NavigableString('\n\n'))
        for x in content:
            x2 = x.__copy__()  # not clone, not extract
            body.append(x2)
        if add_comments:
            body.append(NavigableString('\n\n'))
            body.append(Comment('End of document dump of %r' % docname))
            body.append(NavigableString('\n\n'))
    extract_bibtex_blocks(d)
    logger.info('external bib')
    if bibfile is not None:
        if not os.path.exists(bibfile):
            logger.error('Cannot find bib file %s' % bibfile)
        else:
            bibliography_entries = get_bibliography(bibfile)
            bibliography_entries['id'] = 'bibliography_entries'
            body.append(bibliography_entries)
            # Make sure the bibliography container exists.
            bibhere = d.find('div', id='put-bibliography-here')
            if bibhere is None:
                logger.warning('Could not find #put-bibliography-here in '
                               'document. Adding one at end of document')
                bibhere = Tag(name='div')
                bibhere.attrs['id'] = 'put-bibliography-here'
                d.find('body').append(bibhere)
            do_bib(d, bibhere)
    if True:
        logger.info('reorganizing contents in <sections>')
        body2 = reorganize_contents(d.find('body'))
        body.replace_with(body2)
    else:
        warnings.warn('fix')
        body2 = body
    # Removing elements matching `remove` and `remove_selectors`.
    all_selectors = []
    if remove is not None and remove != '':
        all_selectors.append(remove)
    if remove_selectors:
        all_selectors.extend(remove_selectors)
    logger.debug('all_selectors: %s' % all_selectors)
    all_removed = ''
    for selector in all_selectors:
        nremoved = 0
        # FIX: these log lines interpolated `remove` instead of the loop
        # variable `selector`, misreporting which selector was processed.
        logger.debug('Removing selector %r' % selector)
        toremove = list(body2.select(selector))
        logger.debug('Removing %d objects' % len(toremove))
        for x in toremove:
            nremoved += 1
            nd = len(list(x.descendants))
            logger.debug('removing %s with %s descendants' % (x.name, nd))
            if nd > 1000:
                s = str(x)[:300]
                logger.debug(' it is %s' % s)
            x.extract()
            # Keep a textual record of everything that was removed.
            all_removed += '\n\n' + '-' * 50 + ' chunk %d removed\n' % nremoved
            all_removed += str(x)
            all_removed += '\n\n' + '-' * 100 + '\n\n'
        logger.info('Removed %d elements of selector %r'
                    % (nremoved, selector))
    # if False:
    with open('all_removed.html', 'w') as f:
        f.write(all_removed)
    if hook_before_toc is not None:
        hook_before_toc(soup=d)
    ###
    logger.info('adding toc')
    toc = generate_toc(body2)
    logger.info('TOC:\n' + str(toc))
    toc_ul = bs(toc).ul
    toc_ul.extract()
    assert toc_ul.name == 'ul'
    toc_ul['class'] = 'toc'
    toc_ul['id'] = 'main_toc'
    toc_selector = 'div#toc'
    tocs = list(d.select(toc_selector))
    if not tocs:
        msg = ('Cannot find any element of type %r to put TOC inside.'
               % toc_selector)
        logger.warning(msg)
    else:
        toc_place = tocs[0]
        toc_place.replaceWith(toc_ul)
    logger.info('checking errors')
    check_various_errors(d)
    from mcdp_docs.check_missing_links import check_if_any_href_is_invalid
    logger.info('checking hrefs')
    check_if_any_href_is_invalid(d)
    # Note that this should be done *after* check_if_any_href_is_invalid()
    # because that one might fix some references
    logger.info('substituting empty links')
    substituting_empty_links(d)
    warn_for_duplicated_ids(d)
    if extra_css is not None:
        logger.info('adding extra CSS')
        add_extra_css(d, extra_css)
    add_footnote_polyfill(d)
    logger.info('converting to string')
    # do not use to_html_stripping_fragment - this is a complete doc
    res = unicode(d)
    res = res.encode('utf8')
    logger.info('done - %d bytes' % len(res))
    return res
def check_if_any_href_is_invalid(soup):
    ''' Checks if references are invalid and tries to correct them.

        if it is of the form "#frag?query" then query is stripped out

        Returns (errors, math_errors): lists of error message strings.
    '''
    errors = []
    math_errors = []
    # let's first find all the IDs
    id2element, duplicates = get_id2element(soup, 'id')
    _name2element, _duplicates = get_id2element(soup, 'name')
    # id2element.update(name2element)
    # for a in soup.select('a[href^="#"]'):
    for a in soup.select('[href^="#"]'):
        href = a['href']
        # MathJax-generated links cannot be resolved here; just record them.
        if a.has_attr('class') and "mjx-svg-href" in a['class']:
            msg = ('Invalid math reference (sorry, no details): href = %s .'
                   % href)
            logger.error(msg)
            a.insert_before(Comment('Error: %s' % msg))
            math_errors.append(msg)
            continue
        assert href.startswith('#')
        ID = href[1:]
        # remove query if it exists
        if '?' in ID:
            ID = ID[:ID.index('?')]
        # not_found = []
        if not ID in id2element:
            # try to fix it
            #
            # # it there is named element
            # if ID in name2element:
            #     real_id = name2element[ID].attrs
            # if there is already a prefix, remove it
            if ':' in href:
                i = href.index(':')
                core = href[i+1:]
            else:
                core = ID
            possible = ['sec', 'sub', 'subsub', 'fig', 'tab', 'code', 'app',
                        'appsub', 'appsubsub', 'def', 'eq', 'rem', 'lem',
                        'prob', 'prop', 'exa', 'thm']
            # Heuristic: try every known prefix and collect candidates.
            matches = []
            others = []
            for possible_prefix in possible:
                why_not = possible_prefix + ':' + core
                others.append(why_not)
                if why_not in id2element:
                    matches.append(why_not)
            if len(matches) > 1:
                msg = ('%s not found, and multiple matches for heuristics'
                       ' (%s)' % (href, matches))
                logger.error(msg)
                add_class(a, 'errored')
                w = Tag(name='span',
                        attrs={'class': 'href-invalid href-invalid-missing'})
                w.string = msg
                a.insert_after(w)
            elif len(matches) == 1:
                # Exactly one candidate: rewrite the href and annotate.
                msg = '%s not found, but corrected in %s' % (href, matches[0])
                logger.debug(msg)
                add_class(a, 'warning')
                w = Tag(name='span', attrs={'class': 'href-replaced'})
                w.string = msg
                a['href'] = '#' + matches[0]
                a.insert_after(w)
            else:
                # msg = 'Not found %r (also tried %s)' % (href,
                #     ", ".join(others))
                # not_found.append(ID)
                # logger.error(msg)
                errors.append('Not found %r' % (href))
                if not 'errored' in a.attrs.get('class', ''):
                    add_class(a, 'errored')
                    w = Tag(name='span',
                            attrs={'class':
                                   'href-invalid href-invalid-missing'})
                    w.string = 'Not found %r' % (href)
                    a.insert_after(w)
        if ID in duplicates:
            msg = 'More than one element matching %r.' % href
            logger.error(msg)
            if not 'errored' in a.attrs.get('class', ''):
                add_class(a, 'errored')
                w = Tag(name='span',
                        attrs={'class':
                               'href-invalid href-invalid-multiple'})
                w.string = msg
                a.insert_after(w)
            errors.append(msg)
    return errors, math_errors
def manual_join(template, files_contents, stylesheet, remove=None,
                extra_css=None, remove_selectors=None, hook_before_toc=None,
                references=None, resolve_references=True):
    """
        Joins the given document fragments into one document built from
        `template`, and returns the final document as UTF-8 encoded bytes.

        files_contents: a list of tuples that can be cast to DocToJoin:
        where the string is a unique one to be used for job naming.

        extra_css: if not None, a string of more CSS to be added
        Remove_selectors: list of selectors to remove (e.g. ".draft").

        hook_before_toc if not None is called with hook_before_toc(soup=soup)
        just before generating the toc
    """
    # FIX: the default used to be the mutable `references={}` (shared across
    # calls); normalize a None default here instead. Callers that passed a
    # dict explicitly are unaffected.
    if references is None:
        references = {}
    check_isinstance(files_contents, list)
    files_contents = [DocToJoin(*_) for _ in files_contents]
    template0 = template
    template = replace_macros(template)
    # cannot use bs because entire document
    template_soup = BeautifulSoup(template, 'lxml', from_encoding='utf-8')
    d = template_soup
    if d.html is None:
        s = "Invalid template"
        raise_desc(ValueError, s, template0=template0)
    assert d.html is not None
    assert '<html' in str(d)
    head = d.find('head')
    assert head is not None
    for x in get_manual_css_frag().contents:
        head.append(x.__copy__())
    if stylesheet is not None:
        link = Tag(name='link')
        link['rel'] = 'stylesheet'
        link['type'] = 'text/css'
        from mcdp_report.html import get_css_filename
        link['href'] = get_css_filename('compiled/%s' % stylesheet)
        head.append(link)
    # Parse each fragment; docnames must be unique.
    basename2soup = OrderedDict()
    for doc_to_join in files_contents:
        if doc_to_join.docname in basename2soup:
            msg = 'Repeated docname %r' % doc_to_join.docname
            raise ValueError(msg)
        from .latex.latex_preprocess import assert_not_inside
        assert_not_inside(doc_to_join.contents, '<fragment')
        assert_not_inside(doc_to_join.contents, 'DOCTYPE')
        frag = bs(doc_to_join.contents)
        basename2soup[doc_to_join.docname] = frag
    fix_duplicated_ids(basename2soup)
    body = d.find('body')
    add_comments = False
    # Copy every fragment's children into the template's <body>.
    for docname, content in basename2soup.items():
        if add_comments:
            body.append(NavigableString('\n\n'))
            body.append(Comment('Beginning of document dump of %r' % docname))
            body.append(NavigableString('\n\n'))
        copy_contents_into(content, body)
        f = body.find('fragment')
        if f:
            msg = 'I found a <fragment> in the manual after %r' % docname
            msg += '\n\n' + indent(str(content), '> ')
            raise Exception(msg)
        if add_comments:
            body.append(NavigableString('\n\n'))
            body.append(Comment('End of document dump of %r' % docname))
            body.append(NavigableString('\n\n'))
    extract_bibtex_blocks(d)
    logger.info('external bib')
    # Make sure the bibliography container exists.
    ID_PUT_BIB_HERE = MCDPManualConstants.ID_PUT_BIB_HERE
    bibhere = d.find('div', id=ID_PUT_BIB_HERE)
    if bibhere is None:
        logger.warning(('Could not find #%s in document. '
                        'Adding one at end of document.') % ID_PUT_BIB_HERE)
        bibhere = Tag(name='div')
        bibhere.attrs['id'] = ID_PUT_BIB_HERE
        d.find('body').append(bibhere)
    do_bib(d, bibhere)
    document_final_pass_before_toc(d, remove, remove_selectors)
    if hook_before_toc is not None:
        hook_before_toc(soup=d)
    generate_and_add_toc(d)
    document_final_pass_after_toc(soup=d,
                                  resolve_references=resolve_references)
    if extra_css is not None:
        logger.info('adding extra CSS')
        add_extra_css(d, extra_css)
    document_only_once(d)
    # Resolve known references: rewrite href and fill in empty links.
    for a in d.select('[href]'):
        href = a.attrs['href']
        if href in references:
            r = references[href]
            a.attrs['href'] = r.url
            # FIX: the original tested `not a.children`, but `children` is an
            # iterator and iterators are always truthy, so the title was
            # never filled in. Test the contents list instead.
            if not a.contents:  # empty
                a.append(r.title)
    logger.info('converting to string')
    # do not use to_html_stripping_fragment - this is a complete doc
    res = unicode(d)
    res = res.encode('utf8')
    logger.info('done - %d bytes' % len(res))
    return res
def remove_js(content):
    """Strip JavaScript from *content*: every <script> element is replaced
    in place by a placeholder Comment node."""
    for script_tag in content.find_all('script'):
        script_tag.replace_with(Comment("script rimosso"))
def commentClass(self, data):
    # Treebuilder hook: wrap the raw comment text in a bs4 Comment node and
    # adapt it to the expected node interface via TextNode.
    return TextNode(Comment(data), self.soup)
def remove_doctype_etc(fragment):
    """Replace Declaration, ProcessingInstruction and Doctype nodes found
    directly in *fragment* with explanatory Comment placeholders."""
    unwanted = (Declaration, ProcessingInstruction, Doctype)
    # Iterate over a snapshot: replace_with mutates the tree while looping.
    for node in list(fragment):
        if isinstance(node, unwanted):
            placeholder = Comment('Removed object of type %s'
                                  % type(node).__name__)
            node.replace_with(placeholder)