def document_final_pass_after_toc(soup, crossrefs=None, resolve_references=True, res=None,
                                  location=None):
    """
    Final pass applied to a complete document, after the TOC has been generated.

    Checks for errors, validates hrefs (fixing them against `crossrefs` where
    possible), optionally fills in empty links, rewrites external links, and
    detects duplicate IDs.

    soup: the document soup (mutated in place)
    crossrefs: optional extra references used to validate/resolve hrefs
    resolve_references: if True, substitute empty links after href validation
    res: optional AugmentedResult to accumulate notes (created if None)
    location: optional Location (defaults to LocationUnknown())
    """
    # Fix: the original evaluated LocationUnknown() once at def time as a
    # default argument; use the None-sentinel idiom instead (same pattern as
    # document_final_pass_before_toc).
    if res is None:
        res = AugmentedResult()
    if location is None:
        location = LocationUnknown()

    logger.info('checking errors')
    check_various_errors(soup)

    from .check_missing_links import check_if_any_href_is_invalid
    logger.info('checking hrefs')
    check_if_any_href_is_invalid(soup, res, location, extra_refs=crossrefs)

    # Note that this should be done *after* check_if_any_href_is_invalid()
    # because that one might fix some references
    if resolve_references:
        logger.info('substituting empty links')
        substituting_empty_links(soup, raise_errors=False, res=res, extra_refs=crossrefs)

    # External links: copy the real URL back into href and mark them.
    for a in soup.select('a[href_external]'):
        a.attrs['href'] = a.attrs['href_external']
        add_class(a, 'interdoc')

    detect_duplicate_IDs(soup, res)
def figures_new1():
    # A figure that nests two sub-figures, each with its own caption.
    markup = r"""
<figure>
    <figcaption>Main caption</figcaption>

    <figure>
        <figcaption>Hello</figcaption>
        <img style='width:8em' src="duckietown-logo-transparent.png"/>
    </figure>

    <figure>
        <figcaption>second</figcaption>
        <img style='width:8em' src="duckietown-logo-transparent.png"/>
    </figure>
</figure>
"""
    soup = bs(markup)
    res = AugmentedResult()
    location = LocationUnknown()

    make_figure_from_figureid_attr(soup, res, location)
    # nfigs = len(list(soup.select('figure')))

    rendered = to_html_stripping_fragment(soup)
    print(rendered)
def document_final_pass_before_toc(soup, remove, remove_selectors, res=None, location=None):
    """
    Restructures the document body before the table of contents is generated:
    reorganizes content into <section>s, processes assignments, and removes
    the requested elements.

    soup: the document soup (mutated in place)
    remove: selector of stuff to remove (passed to do_remove_stuff)
    remove_selectors: list of selectors to remove
    res: optional AugmentedResult (created, with a warning, if None)
    location: optional Location (defaults to LocationUnknown())
    """
    if res is None:
        logger.warn('no res passed')
        res = AugmentedResult()
    if location is None:
        location = LocationUnknown()

    logger.info('reorganizing contents in <sections>')

    with timeit('find body'):
        original_body = soup.find('body')
        if original_body is None:
            raise ValueError('Cannot find <body>:\n%s' % indent(str(soup)[:1000], '|'))

    with timeit('reorganize_contents'):
        reorganized = reorganize_contents(original_body)

    process_assignment(reorganized, res, location)

    original_body.replace_with(reorganized)

    # Removing stuff
    with timeit('remove stuff'):
        do_remove_stuff(reorganized, remove_selectors, remove)

    with timeit('move_things_around'):
        move_things_around(soup=soup, res=res)
def another2():
    # Exercises censor_markdown_code_blocks() on LaTeX-ish markdown whose
    # first line is indented by four spaces (which could be mistaken for an
    # indented code block).
    # four spaces in the first line
    s = r"""    (if it exists) of the set of fixed points of~$f$:

\begin{equation}
    x = y .\label{eq:lfp-one}
\end{equation}

The equality in \eqref{lfp-one} can be relaxed to ``$xxx$''.
The equality in \ref{eq:lfp-one} can be relaxed to ``$xxx$''.

The least fixed point need not exist. Monotonicity of the map~$f$
plus completeness is sufficient to ensure existence.
"""
    res = AugmentedResult()
    location = LocationUnknown()
    s2 = censor_markdown_code_blocks(s, res, location)

    print('original:')
    print indent_plus_invisibles(s)
    print('later:')
    print indent_plus_invisibles(s2)

    # NOTE(review): this asserts on the *input* `s`, which trivially never
    # contains 'censored-code'; presumably the intent was to check the
    # censored output `s2` — confirm before changing.
    assert not 'censored-code' in s
def elements_abbrevs_test2():
    # A "TODO:" paragraph gets class 'todo' and is wrapped in div.todo-wrap.
    source = "<p>TODO: paragraph <strong>Strong</strong></p>"
    expected = """<div class="todo-wrap"><p class="todo">TODO: paragraph <strong>Strong</strong></p></div>"""

    soup = bs(source.strip())

    res = AugmentedResult()
    location = LocationUnknown()
    substitute_special_paragraphs(soup, res, location)

    rendered = to_html_stripping_fragment(soup)
    # print(rendered)
    assert_equal(rendered, expected)
def test_toc():
    """
    generate_toc() must reject headers whose ids lack the expected prefixes;
    after fix_ids_and_add_missing() normalizes the ids, TOC generation
    succeeds and the normalized ids appear in the output.
    """
    s = """
<html>
<head></head>
<body>
<h1 id='one'>One</h1>
<p>a</p>
<h2 id='two'>Two</h2>
<p>a</p>
<h3 id='three'>Three</h3>
<h2 id='four'>Four</h2>
<p>a</p>
</body>
</html>
"""
    soup = bs(s)
    # print(soup)
    # body = soup.find('body')

    # first time it should fail
    try:
        _toc = generate_toc(soup)
    except InvalidHeaders:
        # Fix: the bound exception variable was unused (and in Python 2 it
        # leaked and was shadowed by the loop variable below), so drop it.
        # > InvalidHeaders: I expected that this header would start with either part:,app:,sec:.
        # > <h1 id="one">One</h1>
        pass
    else:
        # Fix: raise with an explanatory message instead of a bare Exception().
        raise Exception('generate_toc() should have raised InvalidHeaders')

    soup = bs(s)
    fix_ids_and_add_missing(soup, 'prefix-', AugmentedResult(), LocationUnknown())
    generate_toc(soup)

    s = str(soup)
    expected = ['sec:one', 'sub:two']
    # print(indent(s, 'transformed > '))
    for fragment in expected:
        assert fragment in s, fragment
def test_toc2():
    # Headers without ids get sequentially-numbered, level-prefixed ids.
    markup = """
<html>
<head></head>
<body>
<h1>One</h1>
<h1>Two</h1>
<h1>Three</h1>
<p></p>
<h2>A</h2>
<h2>B</h2>
<h2>C</h2>
<h3>a</h3>
<h3>b</h3>
<h3>c</h3>
</body>
</html>
"""
    soup = bs(markup)
    # print(soup)
    # body = soup.find('body')
    fix_ids_and_add_missing(soup, 'prefix', AugmentedResult(), LocationUnknown())
    assert soup.find(id='sub:prefix-5') is not None

    # Expected result:
    # <fragment>
    # <h1 id="sec:prefix--1">One</h1>
    # <h1 id="sec:prefix--2">Two</h1>
    # <h1 id="sec:prefix--3">Three</h1>
    # <p></p>
    # <h2 id="sub:prefix--4">A</h2>
    # <h2 id="sub:prefix--5">B</h2>
    # <h2 id="sub:prefix--6">C</h2>
    # <h3 id="subsub:prefix--7">a</h3>
    # <h3 id="subsub:prefix--8">b</h3>
    # <h3 id="subsub:prefix--9">c</h3>
    # </fragment>
    print(soup)

    _toc = generate_toc(soup)
    s = str(soup)
def sub2():
    # A github: href with from_text/to_text resolves to a line-range link.
    defaults = {'org': 'AndreaCensi', 'repo': 'mcdp', 'branch': 'duckuments'}
    markup = """
<a href="github:path=context_eval_as_constant.py,from_text=get_connections_for,to_text=return"></a>
"""
    soup = bs(markup)
    location = LocationUnknown()
    res = AugmentedResult()
    n = substitute_github_refs(soup, defaults, res=res, location=location)
    assert n == 1

    rendered = str(soup)
    logger.debug('\n' + indent(rendered, ' '))

    expect = 'context_eval_as_constant.py#L7-L12'
    if expect not in rendered:
        raise Exception('No %s in %s' % (expect, rendered))
def sub1():
    # A bare github: href resolves to a resource link with the file name.
    defaults = {'org': 'AndreaCensi', 'repo': 'mcdp', 'branch': 'duckuments'}
    markup = """
<a href="github:path=context_eval_as_constant.py"></a>
"""
    soup = bs(markup)
    location = LocationUnknown()
    res = AugmentedResult()
    n = substitute_github_refs(soup, defaults, res=res, location=location)
    assert n == 1

    rendered = str(soup)
    logger.debug(indent(rendered, ' '))

    expect = '<code class="github-resource-link">context_eval_as_constant.py</code>'
    if expect not in rendered:
        raise Exception(rendered)
def displayfile1():
    # <display-file> with a github: src gets expanded to the file excerpt.
    defaults = {'org': 'AndreaCensi', 'repo': 'mcdp', 'branch': 'duckuments'}
    markup = """
<display-file src="github:path=context_eval_as_constant.py,from_text=get_connections_for,to_text=return"></a>
"""
    soup = bs(markup)
    res = AugmentedResult()
    location = LocationUnknown()
    n = display_files(soup, defaults, raise_errors=True, res=res, location=location)
    assert n == 1

    rendered = str(soup)
    logger.debug('\n' + indent(rendered, ' '))
def manual_join(template, files_contents,
                stylesheet, remove=None, extra_css=None,
                remove_selectors=None,
                hook_before_toc=None,
                references=None,
                resolve_references=True,
                hook_before_final_pass=None,
                require_toc_placeholder=False,
                permalink_prefix=None,
                crossrefs_aug=None,
                aug0=None):
    """
        Joins the given document fragments into the template and runs the
        full finalization pipeline; returns an AugmentedResult whose result
        is the utf-8 encoded document.

        files_contents: a list of tuples that can be cast to DocToJoin:
        where the string is a unique one to be used for job naming.

        extra_css: if not None, a string of more CSS to be added
        Remove_selectors: list of selectors to remove (e.g. ".draft").

        hook_before_toc if not None is called with hook_before_toc(soup=soup)
        just before generating the toc
    """
    result = AugmentedResult()

    if references is None:
        references = {}
    check_isinstance(files_contents, list)

    # Cross-references either come from a previous stage (as an
    # AugmentedResult whose notes we keep) or default to an empty tag.
    if crossrefs_aug is None:
        crossrefs = Tag(name='no-cross-refs')
    else:
        crossrefs = bs(crossrefs_aug.get_result())
        result.merge(crossrefs_aug)
    if aug0 is not None:
        result.merge(aug0)

    # No-op timer: shadows any module-level timeit so the timing blocks
    # below cost nothing (kept for easy re-enabling).
    @contextmanager
    def timeit(_):
        yield

    with timeit('manual_join'):
        files_contents = [DocToJoin(*_) for _ in files_contents]

        # cannot use bs because entire document
        with timeit('parsing template'):
            template0 = template
            template = replace_macros(template)
            template_soup = BeautifulSoup(template, 'lxml', from_encoding='utf-8')
            d = template_soup
            if d.html is None:
                s = "Invalid template"
                raise_desc(ValueError, s, template0=template0)

        with timeit('adding head'):
            assert d.html is not None
            assert '<html' in str(d)
            head = d.find('head')
            if head is None:
                msg = 'Could not find <head> in template:'
                logger.error(msg)
                logger.error(str(d))
                raise Exception(msg)
            assert head is not None
            # Copy the standard manual CSS fragment into <head>.
            for x in get_manual_css_frag().contents:
                head.append(x.__copy__())

        with timeit('adding stylesheet'):
            if stylesheet is not None:
                link = Tag(name='link')
                link['rel'] = 'stylesheet'
                link['type'] = 'text/css'
                from mcdp_report.html import get_css_filename
                link['href'] = get_css_filename('compiled/%s' % stylesheet)
                head.append(link)

        # Parse each fragment, keyed by (unique) docname; merge any notes
        # carried by AugmentedResult contents.
        with timeit('making basename2soup'):
            basename2soup = OrderedDict()
            for doc_to_join in files_contents:
                if doc_to_join.docname in basename2soup:
                    msg = 'Repeated docname %r' % doc_to_join.docname
                    raise ValueError(msg)
                from .latex.latex_preprocess import assert_not_inside
                if isinstance(doc_to_join.contents, AugmentedResult):
                    result.merge(doc_to_join.contents)
                    contents = doc_to_join.contents.get_result()
                else:
                    contents = doc_to_join.contents
                assert_not_inside(contents, '<fragment')
                assert_not_inside(contents, 'DOCTYPE')
                frag = bs(contents)
                basename2soup[doc_to_join.docname] = frag

        # with timeit('fix_duplicate_ids'):
        # XXX
        # fix_duplicated_ids(basename2soup)

        # Move every fragment's children into the template <body>.
        with timeit('copy contents'):
            body = d.find('body')
            add_comments = False
            for docname, content in basename2soup.items():
                if add_comments:
                    body.append(NavigableString('\n\n'))
                    body.append(
                        Comment('Beginning of document dump of %r' % docname))
                    body.append(NavigableString('\n\n'))
                try_faster = True
                if try_faster:
                    # extract() detaches from the fragment, avoiding a copy
                    for e in list(content.children):
                        body.append(e.extract())
                else:
                    copy_contents_into(content, body)
                if add_comments:
                    body.append(NavigableString('\n\n'))
                    body.append(Comment('End of document dump of %r' % docname))
                    body.append(NavigableString('\n\n'))

        with timeit('extract_bibtex_blocks'):
            extract_bibtex_blocks(d)

        with timeit('ID_PUT_BIB_HERE'):
            ID_PUT_BIB_HERE = MCDPManualConstants.ID_PUT_BIB_HERE
            bibhere = d.find('div', id=ID_PUT_BIB_HERE)
            if bibhere is None:
                # No bibliography placeholder: warn and create one at the end.
                msg = ('Could not find #%s in document. '
                       'Adding one at end of document.') % ID_PUT_BIB_HERE
                result.note_warning(msg)
                bibhere = Tag(name='div')
                bibhere.attrs['id'] = ID_PUT_BIB_HERE
                d.find('body').append(bibhere)

            do_bib(d, bibhere)

        with timeit('hook_before_final_pass'):
            if hook_before_final_pass is not None:
                hook_before_final_pass(soup=d)

        with timeit('document_final_pass_before_toc'):
            location = LocationUnknown()
            document_final_pass_before_toc(d, remove, remove_selectors, result, location)

        with timeit('hook_before_toc'):
            if hook_before_toc is not None:
                hook_before_toc(soup=d)

        with timeit('generate_and_add_toc'):
            try:
                generate_and_add_toc(d, raise_error=True, res=result)
            except NoTocPlaceholder as e:
                # Missing placeholder is fatal only when required; with an
                # aug0 sink the error is recorded instead of raised.
                if require_toc_placeholder:
                    msg = 'Could not find toc placeholder: %s' % e
                    # logger.error(msg)
                    if aug0 is not None:
                        result.note_error(msg)
                    else:
                        raise Exception(msg)

        with timeit('document_final_pass_after_toc'):
            document_final_pass_after_toc(
                soup=d, crossrefs=crossrefs,
                resolve_references=resolve_references, res=result)

        if extra_css is not None:
            logger.info('adding extra CSS')
            add_extra_css(d, extra_css)

        with timeit('document_only_once'):
            document_only_once(d)

        location = LocationUnknown()
        substitute_github_refs(d, defaults={}, res=result, location=location)

        # Resolve remaining links against the `references` table.
        with timeit('another A pass'):
            for a in d.select('a[href]'):
                href = a.attrs['href']
                if href in references:
                    r = references[href]
                    a.attrs['href'] = r.url
                    # NOTE(review): BeautifulSoup's .children is an iterator,
                    # which is presumably always truthy, so this branch may
                    # never run — confirm whether the title was meant to be
                    # appended to empty anchors.
                    if not a.children:  # empty
                        a.append(r.title)

        # do not use to_html_stripping_fragment - this is a complete doc
        # mark_in_html(result, soup=d)

        add_github_links_if_edit_url(soup=d, permalink_prefix=permalink_prefix)

        with timeit('converting to string'):
            res = unicode(d)

        with timeit('encoding'):
            res = res.encode('utf8')

        logger.info('done - %.1f MB' % (len(res) / (1024 * 1024.0)))

        result.set_result(res)
        return result