def update_refs(filename2contents):
    """Rewrite fragment-only hrefs ('#id') into 'filename#id' links.

    filename2contents: mapping filename -> parsed contents (BeautifulSoup).
    First builds an index id -> filename from all elements (and the
    top-level section id), then rewrites every anchor whose href starts
    with '#'. Unresolvable IDs are logged as errors.
    """
    id2filename = {}
    for filename, contents in filename2contents.items():
        for element in contents.findAll(id=True):
            id_ = element.attrs['id']
            if id_ in id2filename:
                logger.error('double element with ID %s' % id_)
            # last occurrence wins
            id2filename[id_] = filename
        # also don't forget the id for the entire section
        if 'id' in contents.attrs:
            id_ = contents.attrs['id']
            id2filename[id_] = filename
    # logger.info(id2filename)
    for filename, contents in filename2contents.items():
        for a in contents.findAll(
                href=lambda x: x is not None and x.startswith('#')):
            href = a.attrs['href']
            assert href[0] == '#'
            id_ = href[1:]
            # TODO: parse out "?" query suffix if present
            if id_ in id2filename:
                new_href = '%s#%s' % (id2filename[id_], id_)
                a.attrs['href'] = new_href
            else:
                # bug fix: error message said 'elemement'
                logger.error('no element with ID %s' % id_)
def __init__(self, allow_or_deny, to_whom, privilege):
    """ACL rule: allow or deny `privilege` for `to_whom`.

    to_whom is one of: a known constant (everyone/authenticated),
    'user:<name>', 'group:<name>', or 'special:<...>'.
    Raises ValueError for an unknown privilege; merely logs an error
    for an invalid to_whom spec (preserving original behavior).

    NOTE: the original source was corrupted by credential masking
    ('user:'******':'); the `user:` branch below is reconstructed
    symmetrically to the `group:` branch.
    """
    self.allow_or_deny = allow_or_deny
    self.to_whom = to_whom
    self.privilege = privilege

    if privilege not in Privileges.ALL_PRIVILEGES:
        raise ValueError('Unknown privilege %r' % privilege)

    def valid_group(x):
        return len(x) > 0

    def valid_username(x):
        return len(x) > 0

    valid = False
    some_ok = [
        MCDPConstants.EVERYONE,
        MCDPConstants.AUTHENTICATED,
    ]
    if to_whom in some_ok:
        valid = True
    elif to_whom.startswith('user:'):
        username = to_whom[to_whom.index(':') + 1:]
        valid = valid_username(username)
    elif to_whom.startswith('group:'):
        group = to_whom[to_whom.index(':') + 1:]
        valid = valid_group(group)
    elif to_whom.startswith('special:'):
        valid = True
    else:
        pass
    if not valid:
        msg = 'Invalid to_whom spec: %s' % to_whom
        logger.error(msg)
def get_id2element(soup, att):
    """Index the elements of `soup` by the value of attribute `att`.

    SVG internals and MathJax-generated elements are skipped.
    Duplicated values are annotated in the tree with an 'errored' class
    and a warning <span>, and reported via the returned set.

    Returns (id2element, duplicates).
    """
    id2element = {}
    duplicates = set()

    # Attribute values we deliberately skip (math/SVG machinery).
    ignore = set()
    skip_selectors = (
        'svg [%s]' % att,          # node with ID below SVG
        'svg[%s]' % att,           # svg with ID
        '[%s^="MathJax"]' % att,   # stuff created by MathJax
    )
    for selector in skip_selectors:
        for el in soup.select(selector):
            ignore.add(el[att])

    for el in soup.select('[%s]' % att):
        value = el[att]
        if value in ignore:
            continue
        if value in id2element:
            duplicates.add(value)
            previous = id2element[value]
            for offender in (el, previous):
                # annotate each offender at most once
                if 'errored' not in offender.attrs.get('class', ''):
                    add_class(offender, 'errored')
                    w = Tag(name='span', attrs={'class': 'duplicated-id'})
                    w.string = 'More than one element with id %r.' % value
                    offender.insert_after(w)
        id2element[el[att]] = el

    if duplicates:
        s = ", ".join(sorted(duplicates))
        msg = '%d duplicated %s found (not errored): %s' % (
            len(duplicates), att, s)
        logger.error(msg)
    return id2element, duplicates
def get_id2filename(filename2contents):
    """Build a map from element ID to the filename that contains it.

    IDs belonging to navigation/comments machinery (`ignore_these`) are
    skipped. A duplicated ID is logged; the last occurrence wins.
    """
    ignore_these = [
        'tocdiv', 'not-toc',
        'disqus_thread', 'disqus_section', 'dsq-count-scr',
        'banner',
    ]
    id2filename = {}
    for filename, contents in filename2contents.items():
        for element in contents.findAll(id=True):
            the_id = element.attrs['id']
            if the_id in ignore_these:
                continue
            if the_id in id2filename:
                logger.error('double element with ID %s' % the_id)
            id2filename[the_id] = filename
        # the top-level section element may carry an id as well
        if 'id' in contents.attrs:
            id2filename[contents.attrs['id']] = filename
    return id2filename
def go(self):
    """Selenium smoke test: open a model view, log in, browse shelves,
    and zoom the editor, taking screenshots along the way.

    Assumes a server is already running on localhost:8080.
    """
    # Open the dp_graph view of one specific model directly.
    url = 'http://localhost:8080/repos/bundled/shelves/unittests/libraries/basic/models/minus_r_real3/views/dp_graph/'
    self.driver.get(url)
    self.screenshot()
    self.driver.get('http://localhost:8080/')
    self.screenshot()
    self.click_partial_link_text('login')
    driver = self.driver
    # fill log in screen
    e = driver.find_element_by_css_selector('input[name=login]')
    e.send_keys("andrea")
    e = driver.find_element_by_css_selector('input[name=password]')
    e.send_keys("editor")
    es = driver.find_elements_by_css_selector('button')
    if len(es) > 1:
        msg = 'There should not be more than 1 button'
        logger.error(msg)
    # NOTE(review): this clicks the *second* button unconditionally and
    # raises IndexError if fewer than two exist — confirm es[0] was not
    # intended given the error message above.
    es[1].click()
    # go to shelves
    self.click_partial_link_text('shelves')
    self.click_partial_link_text('unittests')
    self.click_partial_link_text('basic')
    # NOTE(review): link text 'minus_real3' differs from the model name
    # 'minus_r_real3' used in the URL above — confirm which is correct.
    self.click_partial_link_text('minus_real3')
    self.click_css('button#size-plus')
    self.click_css('button#size-plus')
    self.click_css('button#size-plus')
def fix_header_id(header):
    """Make sure h1..h4 headers carry an id with an allowed prefix.

    - No id at all: assign '<default_prefix>:<counter>'.
    - Id without a prefix: prepend the default prefix (except 'booktitle'),
      leaving a warning comment in the tree.
    - Id with a prefix not allowed for this header level: log an error and
      leave an error comment after the header.
    """
    ID = header.get('id', None)
    if ID is not None and ':' in ID:
        prefix = ID[:ID.index(':')]
    else:
        prefix = None

    allowed_prefixes_h = {
        'h1': ['sec', 'app', 'part'],
        'h2': ['sub', 'appsub'],
        'h3': ['subsub', 'appsubsub'],
        'h4': ['par'],
    }

    if header.name not in allowed_prefixes_h:
        return
    allowed_prefixes = allowed_prefixes_h[header.name]
    default_prefix = allowed_prefixes[0]

    if ID is None:
        # no id at all: synthesize one from the global counter
        header['id'] = '%s:%s' % (default_prefix, GlobalCounter.header_id)
        GlobalCounter.header_id += 1
        return

    if prefix is None:
        if ID != 'booktitle':
            msg = ('Adding prefix %r to current id %r for %s.' %
                   (default_prefix, ID, header.name))
            header.insert_before(Comment('Warning: ' + msg))
            header['id'] = default_prefix + ':' + ID
    elif prefix not in allowed_prefixes:
        msg = ('The prefix %r is not allowed for %s (ID=%r)' %
               (prefix, header.name, ID))
        logger.error(msg)
        header.insert_after(Comment('Error: ' + msg))
def replace_macros(s):
    '''
        Replaces strings of the type @@{key}

        It looks in MCDPManualConstants.macros

        Also available
            @@{MCDPConstants.name}
    '''
    macros = MCDPManualConstants.macros

    class MyTemplate(Template):
        # '@@{key}' placeholders; dotted keys allowed (e.g. MCDPConstants.name)
        delimiter = '@@'
        idpattern = r'[_a-z][\._a-z0-9]*'

        def _invalid(self, mo):
            # Translate the regex position of the invalid placeholder into
            # a (line, column) location for a proper syntax error.
            i = mo.start('invalid')
            lines = self.template[:i].splitlines(True)
            if not lines:
                colno = 1
                lineno = 1
            else:
                colno = i - len(''.join(lines[:-1]))
                lineno = len(lines)
            char = location(lineno - 1, colno - 1, s)
            w = Where(s, char)
            raise DPSyntaxError('Invalid placeholder', where=w)

    class Sub(object):
        # Mapping wrapper that resolves dotted keys recursively.
        def __init__(self, data):
            self.data = data

        def __getitem__(self, key):
            if key in self.data:
                return self.data[key]
            if '.' in key:
                i = key.index('.')
                first, last = key[:i], key[i + 1:]
                # print('%s -> %s, %s' % (key, first, last))
                return self[first][last]
            raise KeyError(key)

    t = MyTemplate(s)
    # (bug fix: removed a post-instantiation assignment to
    # MyTemplate.idpattern — Template compiles its pattern when the class
    # is created, so the late assignment had no effect.)
    try:
        s2 = t.substitute(Sub(macros))
    except KeyError as e:
        # Point the user at the first occurrence of the missing key.
        key = str(e).replace("'", "")
        search_for = MyTemplate.delimiter + key
        logger.error('Could not find key %r' % key)
        char = s.index(search_for)
        w = Where(s, char)
        msg = 'Key %r not found - maybe use braces?' % key
        raise DPSyntaxError(msg, where=w)
    return s2
def process(dirname, e):
    """Load one 'thing' from the DB and check that the observed outcome
    (success, syntax / semantic / not-implemented error) matches what the
    source file declares it should produce.

    Returns a Result(error_type, error_string, cpu, warnings).
    """
    db_view = db_view_from_dirname(dirname)
    host_cache = HostCache(db_view)
    # Resolve entity coordinates: repo -> shelf -> library -> things.
    e.repo = db_view.repos[e.repo_name]
    e.shelf = e.repo.shelves[e.shelf_name]
    e.library = e.shelf.libraries[e.library_name]
    e.things = e.library.things.child(e.spec_name)
    subscribed_shelves = get_all_shelves(db_view)
    e.context = TheContext(host_cache, db_view, subscribed_shelves, e.library_name)
    e.mcdp_library = e.context.get_library()
    source = e.things[e.thing_name]
    # NOTE(review): time.clock() is deprecated (removed in Python 3.8);
    # this code relies on Python 2 semantics.
    t0 = time.clock()
    try:
        context = e.context.child()
        e.mcdp_library.load_spec(e.spec_name, e.thing_name, context=context)
        error = None
        error_string = None
        exc = None
    except MCDPException as exc:
        # NOTE(review): reading `exc` after the except block only works in
        # Python 2; Python 3 unbinds the name when the handler exits.
        error = type(exc).__name__
        error_string = str(exc)
    finally:
        cpu = time.clock() - t0
    # Compare the observed outcome with the expectation encoded in the source.
    if gives_syntax_error(source):
        if isinstance(exc, DPSyntaxError):
            error = None
            error_string = None
        else:
            error = 'Unexpected'
            error_string = 'Expected DPSyntaxError error, got %s' % type(exc).__name__
            # NOTE(review): this indents error_string into itself; the
            # exception text was probably intended as the argument.
            error_string += '\n' + indent(error_string, 'obtained > ')
    elif gives_semantic_error(source):
        if isinstance(exc, DPSemanticError):
            error = None
            error_string = None
        else:
            error = 'Unexpected'
            error_string = 'Expected DPSemanticError error, got %s' % type(exc).__name__
            # NOTE(review): same self-indentation issue as above.
            error_string += '\n' + indent(error_string, 'obtained > ')
    elif gives_not_implemented_error(source):
        if isinstance(exc, DPNotImplementedError):
            error = None
            error_string = None
        else:
            error = 'Unexpected'
            error_string = 'Expected DPNotImplementedError error, got %s' % type(exc).__name__
            # NOTE(review): same self-indentation issue as above.
            error_string += '\n' + indent(error_string, 'obtained > ')
    if error:
        logger.error(e.id + ' ' + error)
    return Result(error_type=error, error_string=error_string, cpu=cpu,
                  warnings=0)
def rmtree_only_contents(d):
    """ Removes all the contents but not the directory itself. """
    for entry in os.listdir(d):
        path = os.path.join(d, entry)
        try:
            if os.path.isdir(path):
                # subdirectory: remove recursively
                shutil.rmtree(path)
            elif os.path.isfile(path):
                os.unlink(path)
        except Exception as e:
            # best-effort: log and keep removing the rest
            logger.error(e)
def add_prev_next_links(filename2contents, only_for=None):
    """Wrap each page's contents in a <div class="super"> that carries
    'prev'/'next' navigation bars at top and bottom.

    filename2contents: mapping filename -> contents tag; each contents tag
    must carry ATTR_PREV / ATTR_NEXT attributes naming the adjacent pages.
    only_for: if given, only these filenames are processed (others dropped).

    Returns a new OrderedDict filename -> wrapper div.
    """
    new_one = OrderedDict()
    for filename, contents in list(filename2contents.items()):
        if only_for and filename not in only_for:
            continue

        # build the 'prev' anchor
        id_prev = contents.attrs[ATTR_PREV]
        a_prev = Tag(name='a')
        a_prev.attrs['href'] = '#' + str(id_prev)
        a_prev.attrs['class'] = CLASS_LINK_PREV
        a_prev.append('prev')

        # build the 'next' anchor
        id_next = contents.attrs[ATTR_NEXT]
        a_next = Tag(name='a')
        a_next.attrs['href'] = '#' + str(id_next)
        a_next.attrs['class'] = CLASS_LINK_NEXT
        a_next.append('next')

        S = Tag(name='div')
        S.attrs['class'] = ['super']

        # navigation bar: prev/next (only when the ids are non-empty)
        nav1 = Tag(name='div')
        add_class(nav1, 'navigation')
        if id_prev:
            nav1.append(a_prev.__copy__())
        if id_next:
            nav1.append(a_next.__copy__())
        spacer = Tag(name='div')
        spacer.attrs['style'] = 'clear:both'
        nav1.append(spacer)

        add_class(contents, 'main-section-for-page')
        contents2 = contents
        S.append(contents2)

        from .source_info_imp import get_main_header
        actual_id = get_main_header(contents2)
        # (removed a dead `if False:` sanity-check block that looked up
        # actual_id in contents2 and only logged on failure)
        S.attrs['id'] = actual_id
        # navigation at both top and bottom of the page
        contents2.insert(0, nav1.__copy__())
        contents2.append(nav1.__copy__())
        new_one[filename] = S
    return new_one
def warn_for_duplicated_ids(soup):
    """Find duplicated id attributes in the soup, mark the offending
    elements with class 'errored', and rename every duplicate after the
    first to '<id>-duplicate-<n>' so numbering machinery can proceed.

    MathJax/edge-generated ids and ids inside <svg> are expected to repeat
    and are ignored.
    """
    from collections import defaultdict
    # idiom fix: defaultdict(list) instead of defaultdict(lambda: [])
    counts = defaultdict(list)
    for e in soup.select('[id]'):
        ID = e['id']
        counts[ID].append(e)
    problematic = []
    for ID, elements in counts.items():
        n = len(elements)
        if n == 1:
            continue
        ignore_if_contains = [
            'MathJax',  # 'MJ',
            'edge',
            'mjx-eqn',
        ]
        if any(_ in ID for _ in ignore_if_contains):
            continue
        # skip duplicates that live inside an SVG (generated artwork)
        inside_svg = False
        for e in elements:
            for _ in e.parents:
                if _.name == 'svg':
                    inside_svg = True
                    break
        if inside_svg:
            continue
        #msg = ('ID %15s: found %s - numbering will be screwed up' % (ID, n))
        # logger.error(msg)
        problematic.append(ID)
        for e in elements:
            # (removed construction of an explanatory <span> that was never
            # inserted — its insert_before call was commented out)
            add_class(e, 'errored')
        # rename all but the first occurrence so ids become unique
        for i, e in enumerate(elements[1:]):
            e['id'] = e['id'] + '-duplicate-%d' % (i + 1)
            #print('changing ID to %r' % e['id'])
    if problematic:
        logger.error('The following IDs were duplicated: %s' %
                     ", ".join(problematic))
        logger.error(
            'I renamed some of them; references and numbering are screwed up')
def get_empty_links_to_fragment(element_to_modify, extra_refs, res): """ Find all empty links that have a reference to a fragment. yield LinkElement """ # logger.debug('building index') # first find all elements by id id2element_local, duplicates = get_id2element(element_to_modify, 'id') id2element_extra, duplicates2 = get_id2element(extra_refs, 'id') for k in id2element_extra: if k in id2element_local: if 'ignore_if_conflict' in id2element_extra[k].attrs: continue msg = 'ID %s in cross references also contained locally.' % k def cut(x): if len(x) < 500: return x else: return x[:500] + ' ... ' msg += '\n\n' + indent(cut(id2element_local[k]), '', 'local: ') msg += '\n\n' + indent(cut(id2element_extra[k]), '', 'crossrefs: ') res.note_error(msg, HTMLIDLocation.for_element(id2element_local[k])) logger.error(msg) id2element = {} id2element.update(id2element_extra) id2element.update(id2element_local) # logger.debug('building index done') for element in get_empty_links(element_to_modify): if not 'href' in element.attrs: continue href = element.attrs['href'] if not href.startswith('#'): continue rest = href[1:] eid = rest query = None linked = id2element.get(eid, None) # noinspection PyArgumentList yield LinkElement(linker=element, eid=eid, linked=linked, query=query)
def update_refs_(filename, contents, id2filename):
    """Rewrite fragment hrefs in `contents` using the id -> filename index.

    Links pointing into another file become 'otherfile#id' and are tagged
    'link-different-file'; links into this same file keep the fragment form
    and are tagged 'link-same-file' (with extra classes for TOC entries so
    CSS can highlight the current page's subtree).
    """
    test_href = lambda _: _ is not None and _.startswith('#')
    elements = list(contents.find_all('a', attrs={'href': test_href}))
    # logger.debug('updates: %s' % sorted(id2filename))
    for a in elements:
        href = a.attrs['href']
        assert href[0] == '#'
        id_ = href[1:]
        if id_ in id2filename:
            point_to_filename = id2filename[id_]
            if point_to_filename != filename:
                # target lives in a different output file
                new_href = '%s#%s' % (point_to_filename, id_)
                a.attrs['href'] = new_href
                add_class(a, 'link-different-file')
            else:
                # actually it doesn't change
                new_href = '#%s' % id_
                a.attrs['href'] = new_href
                add_class(a, 'link-same-file')
                # ('class' is guaranteed present: add_class just set it)
                if 'toc_link' in a.attrs['class']:
                    p = a.parent
                    assert p.name == 'li'
                    add_class(p, 'link-same-file-direct-parent')
                    # now find all the lis
                    for x in list(p.descendants):
                        if isinstance(x, Tag) and x.name == 'li':
                            add_class(x, 'link-same-file-inside')
                    # walk up and mark every enclosing ul/li
                    p = a.parent
                    while p:
                        if isinstance(p, Tag) and p.name in ['ul', 'li']:
                            add_class(p, 'contains-link-same-file')
                        p = p.parent
        else:
            logger.error('update_ref() for %r: no element with ID "%s".' %
                         (filename, id_))
def get_id2filename(filename2contents):
    """Build a map from element ID to the filename containing it.

    Skips ids that can_ignore_duplicated_id() approves of, plus the
    navigation/comments/MathJax machinery listed in `ignore_these`.
    A duplicated ID is logged; the last occurrence wins.
    """
    ignore_these = [
        'tocdiv', 'not-toc',
        'disqus_thread', 'disqus_section', 'dsq-count-scr',
        'banner',
        'MathJax_SVG_glyphs', 'MathJax_SVG_styles',
    ]
    id2filename = {}
    for filename, contents in filename2contents.items():
        for element in contents.select('[id]'):
            if can_ignore_duplicated_id(element):
                continue
            the_id = element.attrs['id']
            if the_id in ignore_these:
                continue
            if the_id in id2filename:
                logger.error('double element with ID %s' % the_id)
                # logger.error(str(element.parent()))
            id2filename[the_id] = filename
        # the section element itself may carry an id as well
        if 'id' in contents.attrs:
            id2filename[contents.attrs['id']] = filename
    return id2filename
def get_id2element(soup, att):
    """Index elements of `soup` by the value of attribute `att`,
    skipping SVG internals and MathJax-generated elements.

    Returns (id2element, duplicates): an OrderedDict value -> element
    (last occurrence wins) and the full set of duplicated values.
    """
    id2element = OrderedDict()
    duplicates = set()
    # ignore the maths
    ignore = set()
    for element in soup.select('svg [%s]' % att):  # node with ID below SVG
        ignore.add(element[att])
    for element in soup.select('svg[%s]' % att):  # svg with ID
        ignore.add(element[att])
    for element in soup.select('[%s^="MathJax"]' % att):  # stuff created by MathJax
        ignore.add(element[att])
    for element in soup.select('[%s]' % att):
        ID = element[att]
        if ID in ignore:
            continue
        if ID in id2element:
            duplicates.add(ID)
            # (removed a dead `if False:` block that referenced an
            # undefined name `res` — it would have raised NameError)
        id2element[element[att]] = element
    if duplicates:
        n = len(duplicates)
        # Bug fix: only the *log message* is truncated to 100 entries;
        # previously `duplicates` itself was truncated, so callers received
        # an incomplete (and list-typed) result when n > 100.
        shown = sorted(duplicates)[:100]
        s = ", ".join(shown)
        msg = '%d duplicated %s found: %s' % (n, att, s)
        logger.error(msg)
    return id2element, duplicates
def check_various_errors(d):
    """Log any processing-error elements left in the soup `d`:
    elements with class DPSemanticError/DPSyntaxError, and spurious
    leftover <fragment> elements.
    """
    error_names = ['DPSemanticError', 'DPSyntaxError']
    selector = ", ".join('.' + _ for _ in error_names)
    # Bug fix: find_all(selector) interprets the argument as a *tag name*,
    # so the CSS class selector never matched; select() is required.
    errors = list(d.select(selector))
    if errors:
        msg = 'I found %d errors in processing.' % len(errors)
        logger.error(msg)
        for e in errors:
            logger.error(e.contents)
    fragments = list(d.find_all('fragment'))
    if fragments:
        msg = 'There are %d spurious elements "fragment".' % len(fragments)
        logger.error(msg)
def generate_view_syntax(e, make_relative):
    """Render the 'syntax' view for a thing: highlighted source plus,
    if the source parses and loads, an SVG visualization.

    Returns a dict of template variables (source_code, error, highlight,
    svg_data, parses, ...).
    """
    expr = e.spec.parse_expr
    parse_refine = e.spec.parse_refine
    source_code = e.thing
    context = Context()

    # Holder for the refined parse tree produced during highlighting.
    class Tmp:
        refined = None

    def postprocess(block):
        # Try to refine the parsed block; on semantic errors just keep it.
        if parse_refine is None:
            return block
        try:
            Tmp.refined = parse_refine(block, context)
            return Tmp.refined
        except DPSemanticError:
            return block

    try:
        highlight = ast_to_html(source_code, add_line_gutter=False,
                                parse_expr=expr, postprocess=postprocess)

        def get_link_library(libname):
            # Build a (relative) URL for a library's page.
            try:
                rname, sname = e.session.get_repo_shelf_for_libname(libname)
            except NoSuchLibrary:
                raise
            url0 = "/repos/%s/shelves/%s/libraries/%s/" % (rname, sname, libname)
            return make_relative(url0)

        def get_link(specname, libname, thingname):
            # find library. Returns a string or raises error
            try:
                rname, sname = e.session.get_repo_shelf_for_libname(libname)
            except NoSuchLibrary:
                msg = 'No such library %r' % libname
                logger.debug(msg)
                raise  # return None
            things = e.db_view.repos[rname].shelves[sname].libraries[
                libname].things.child(specname)
            if thingname in things:
                # check if the thing exists
                res = get_link_library(
                    libname) + '%s/%s/views/syntax/' % (specname, thingname)
                # logger.debug(' link for %s = %s' % (thingname, res))
                return res
            else:
                msg = 'No such thing %r' % thingname
                logger.debug(msg)
                raise NoSuchLibrary(msg)

        highlight = add_html_links(highlight, e.library_name, get_link,
                                   get_link_library)
        parses = True
        error = ''
    except (DPSyntaxError, DPNotImplementedError) as exc:
        # Could not even parse: show raw source and the error text.
        highlight = '<pre class="source_code_with_error">%s</pre>' % source_code
        error = exc.__str__()
        parses = False

    if parses:
        mcdp_library = library_from_env(e)
        image_source = image_source_from_env(e)
        try:
            thing = e.spec.load(mcdp_library, e.thing_name, context=context)
            svg_data = get_svg_for_visualization(e, image_source,
                                                 e.library_name, e.spec,
                                                 e.thing_name, thing,
                                                 Tmp.refined, make_relative,
                                                 library=mcdp_library)
        except (DPSemanticError, DPNotImplementedError) as exc:
            logger.error(exc)
            from mcdp_web.editor_fancy.app_editor_fancy_generic import html_mark
            if exc.where.string != source_code:
                # The error location belongs to some other file: internal error.
                msg = 'This exception refers to another file.'
                msg += '\n source_code: %r' % source_code
                msg += '\n exception.where.string: %r' % exc.where.string
                msg += '\n' + indent(traceback.format_exc(exc), 'exc > ')
                raise DPInternalError(msg)
            try:
                # Mark the error location inside the highlighted source.
                highlight = html_mark(highlight, exc.where, "semantic_error")
            except NoLocationFound as e:
                # NOTE(review): this `e` shadows the function parameter `e`
                # for the rest of the handler — confirm intended.
                msg = 'While trying to annotate the exception:'
                msg += '\n' + indent(exc, 'exc > ')
                raise_wrapped(NoLocationFound, e, msg)
            error = exc.error + "\n" + format_where(exc.where)
            svg_data = None
    else:
        svg_data = None

    check_isinstance(highlight, str)
    # NOTE(review): unicode() calls imply Python 2 byte strings here.
    res = {
        'source_code': source_code,
        'error': unicode(error, 'utf-8'),
        'highlight': unicode(highlight, 'utf-8'),
        # 'realpath': realpath,
        'current_view': 'syntax',
        'explanation1_html': None,
        'explanation2_html': None,
        'svg_data': unicode(svg_data, 'utf-8') if svg_data is not None else None,
        'parses': parses,  # whether it parses
    }
    return res
def check_if_any_href_is_invalid(soup):
    '''
        Checks if references are invalid and tries to correct them.

        if it is of the form "#frag?query" then query is stripped out
    '''
    errors = []
    math_errors = []

    # let's first find all the IDs
    id2element, duplicates = get_id2element(soup, 'id')
    _name2element, _duplicates = get_id2element(soup, 'name')
    # id2element.update(name2element)

    # for a in soup.select('a[href^="#"]'):
    for a in soup.select('[href^="#"]'):
        href = a['href']
        # MathJax-generated references cannot be resolved here.
        if a.has_attr('class') and "mjx-svg-href" in a['class']:
            msg = 'Invalid math reference (sorry, no details): href = %s .' % href
            logger.error(msg)
            a.insert_before(Comment('Error: %s' % msg))
            math_errors.append(msg)
            continue
        assert href.startswith('#')
        ID = href[1:]
        # remove query if it exists
        if '?' in ID:
            ID = ID[:ID.index('?')]
        # not_found = []
        if not ID in id2element:
            # try to fix it by guessing the section-type prefix
            # # it there is named element
            # if ID in name2element:
            #     real_id = name2element[ID].attrs
            # if there is already a prefix, remove it
            if ':' in href:
                i = href.index(':')
                core = href[i + 1:]
            else:
                core = ID
            possible = ['sec', 'sub', 'subsub', 'fig', 'tab', 'code', 'app',
                        'appsub', 'appsubsub', 'def', 'eq', 'rem', 'lem',
                        'prob', 'prop', 'exa', 'thm']
            matches = []
            others = []
            for possible_prefix in possible:
                why_not = possible_prefix + ':' + core
                others.append(why_not)
                if why_not in id2element:
                    matches.append(why_not)
            if len(matches) > 1:
                # ambiguous: do not guess, mark as error
                msg = '%s not found, and multiple matches for heuristics (%s)' % (href, matches)
                logger.error(msg)
                add_class(a, 'errored')
                w = Tag(name='span', attrs={'class': 'href-invalid href-invalid-missing'})
                w.string = msg
                a.insert_after(w)
            elif len(matches) == 1:
                # unique prefixed candidate: rewrite the href
                msg = '%s not found, but corrected in %s' % (href, matches[0])
                logger.debug(msg)
                add_class(a, 'warning')
                w = Tag(name='span', attrs={'class': 'href-replaced'})
                w.string = msg
                a['href'] = '#' + matches[0]
                a.insert_after(w)
            else:
                # no candidate at all: record and annotate (only once)
                # msg = 'Not found %r (also tried %s)' % (href, ", ".join(others))
                # not_found.append(ID)
                # logger.error(msg)
                errors.append('Not found %r' % (href))
                if not 'errored' in a.attrs.get('class', ''):
                    add_class(a, 'errored')
                    w = Tag(name='span', attrs={'class': 'href-invalid href-invalid-missing'})
                    w.string = 'Not found %r' % (href)
                    a.insert_after(w)
        if ID in duplicates:
            msg = 'More than one element matching %r.' % href
            logger.error(msg)
            if not 'errored' in a.attrs.get('class', ''):
                add_class(a, 'errored')
                w = Tag(name='span', attrs={'class': 'href-invalid href-invalid-multiple'})
                w.string = msg
                a.insert_after(w)
            errors.append(msg)
    return errors, math_errors
def manual_join(template, files_contents, bibfile, stylesheet, remove=None,
                extra_css=None, remove_selectors=None, hook_before_toc=None):
    """
    Join individual documents into one manual HTML document.

    extra_css: if not None, a string of more CSS to be added
    Remove_selectors: list of selectors to remove (e.g. ".draft").

    hook_before_toc if not None is called with
    hook_before_toc(soup=soup)
    just before generating the toc
    """
    logger.debug('remove_selectors: %s' % remove_selectors)
    logger.debug('remove: %s' % remove)
    from mcdp_utils_xml import bs

    template = replace_macros(template)

    # cannot use bs because entire document
    template_soup = BeautifulSoup(template, 'lxml', from_encoding='utf-8')
    d = template_soup
    assert d.html is not None
    assert '<html' in str(d)
    head = d.find('head')
    assert head is not None
    for x in get_manual_css_frag().contents:
        head.append(x.__copy__())

    if stylesheet is not None:
        link = Tag(name='link')
        link['rel'] = 'stylesheet'
        link['type'] = 'text/css'
        from mcdp_report.html import get_css_filename
        link['href'] = get_css_filename('compiled/%s' % stylesheet)
        head.append(link)

    basename2soup = OrderedDict()
    from mcdp_docs.latex.latex_preprocess import assert_not_inside
    for (_libname, docname), data in files_contents:
        # Bug fix: the DOCTYPE check and the size log used to run in the
        # copy loop below, where `data` was the stale value from this
        # loop's last iteration.
        assert_not_inside(data, 'DOCTYPE')
        logger.debug('docname %r -> %s KB' % (docname, len(data) / 1024))
        frag = bs(data)
        basename2soup[docname] = frag

    fix_duplicated_ids(basename2soup)

    body = d.find('body')
    add_comments = False
    for docname, content in basename2soup.items():
        if add_comments:
            body.append(NavigableString('\n\n'))
            body.append(Comment('Beginning of document dump of %r' % docname))
            body.append(NavigableString('\n\n'))
        for x in content:
            x2 = x.__copy__()  # not clone, not extract
            body.append(x2)
        if add_comments:
            body.append(NavigableString('\n\n'))
            body.append(Comment('End of document dump of %r' % docname))
            body.append(NavigableString('\n\n'))

    extract_bibtex_blocks(d)
    logger.info('external bib')
    if bibfile is not None:
        if not os.path.exists(bibfile):
            logger.error('Cannot find bib file %s' % bibfile)
        else:
            bibliography_entries = get_bibliography(bibfile)
            bibliography_entries['id'] = 'bibliography_entries'
            body.append(bibliography_entries)

    # Make sure there is a place to put the bibliography.
    bibhere = d.find('div', id='put-bibliography-here')
    if bibhere is None:
        logger.warning('Could not find #put-bibliography-here in document.'
                       'Adding one at end of document')
        bibhere = Tag(name='div')
        bibhere.attrs['id'] = 'put-bibliography-here'
        d.find('body').append(bibhere)
    do_bib(d, bibhere)

    if True:
        logger.info('reorganizing contents in <sections>')
        body2 = reorganize_contents(d.find('body'))
        body.replace_with(body2)
    else:
        warnings.warn('fix')
        body2 = body

    # Removing
    all_selectors = []
    if remove is not None and remove != '':
        all_selectors.append(remove)
    if remove_selectors:
        all_selectors.extend(remove_selectors)
    logger.debug('all_selectors: %s' % all_selectors)

    all_removed = ''
    for selector in all_selectors:
        nremoved = 0
        # Bug fix: these messages used to interpolate `remove` instead of
        # the selector actually being processed.
        logger.debug('Removing selector %r' % selector)
        toremove = list(body2.select(selector))
        logger.debug('Removing %d objects' % len(toremove))
        for x in toremove:
            nremoved += 1
            nd = len(list(x.descendants))
            logger.debug('removing %s with %s descendants' % (x.name, nd))
            if nd > 1000:
                s = str(x)[:300]
                logger.debug(' it is %s' % s)
            x.extract()
            all_removed += '\n\n' + '-' * 50 + ' chunk %d removed\n' % nremoved
            all_removed += str(x)
            all_removed += '\n\n' + '-' * 100 + '\n\n'
        logger.info('Removed %d elements of selector %r' % (nremoved, selector))

    if False:  # debug dump of everything removed
        with open('all_removed.html', 'w') as f:
            f.write(all_removed)

    if hook_before_toc is not None:
        hook_before_toc(soup=d)

    ###
    logger.info('adding toc')
    toc = generate_toc(body2)
    logger.info('TOC:\n' + str(toc))
    toc_ul = bs(toc).ul
    toc_ul.extract()
    assert toc_ul.name == 'ul'
    toc_ul['class'] = 'toc'
    toc_ul['id'] = 'main_toc'
    toc_selector = 'div#toc'
    tocs = list(d.select(toc_selector))
    if not tocs:
        msg = 'Cannot find any element of type %r to put TOC inside.' % toc_selector
        logger.warning(msg)
    else:
        toc_place = tocs[0]
        toc_place.replaceWith(toc_ul)

    logger.info('checking errors')
    check_various_errors(d)

    from mcdp_docs.check_missing_links import check_if_any_href_is_invalid
    logger.info('checking hrefs')
    check_if_any_href_is_invalid(d)

    # Note that this should be done *after* check_if_any_href_is_invalid()
    # because that one might fix some references
    logger.info('substituting empty links')
    substituting_empty_links(d)

    warn_for_duplicated_ids(d)

    if extra_css is not None:
        logger.info('adding extra CSS')
        add_extra_css(d, extra_css)

    add_footnote_polyfill(d)

    logger.info('converting to string')
    # do not use to_html_stripping_fragment - this is a complete doc
    res = unicode(d)
    res = res.encode('utf8')
    logger.info('done - %d bytes' % len(res))
    return res
def manual_join(template, files_contents, stylesheet, remove=None,
                extra_css=None, remove_selectors=None, hook_before_toc=None,
                references=None, resolve_references=True,
                hook_before_final_pass=None, require_toc_placeholder=False,
                permalink_prefix=None, crossrefs_aug=None, aug0=None):
    """
    files_contents: a list of tuples that can be cast to DocToJoin:
    where the string is a unique one to be used for job naming.

    extra_css: if not None, a string of more CSS to be added
    Remove_selectors: list of selectors to remove (e.g. ".draft").

    hook_before_toc if not None is called with
    hook_before_toc(soup=soup)
    just before generating the toc
    """
    result = AugmentedResult()
    if references is None:
        references = {}
    check_isinstance(files_contents, list)

    # Cross-references document (may be empty).
    if crossrefs_aug is None:
        crossrefs = Tag(name='no-cross-refs')
    else:
        crossrefs = bs(crossrefs_aug.get_result())
        result.merge(crossrefs_aug)
    if aug0 is not None:
        result.merge(aug0)

    # No-op timing context; kept for the structure of the passes below.
    @contextmanager
    def timeit(_):
        yield

    with timeit('manual_join'):
        files_contents = [DocToJoin(*_) for _ in files_contents]
        # cannot use bs because entire document
        with timeit('parsing template'):
            template0 = template
            template = replace_macros(template)
            template_soup = BeautifulSoup(template, 'lxml', from_encoding='utf-8')
            d = template_soup
            if d.html is None:
                s = "Invalid template"
                raise_desc(ValueError, s, template0=template0)

        with timeit('adding head'):
            assert d.html is not None
            assert '<html' in str(d)
            head = d.find('head')
            if head is None:
                msg = 'Could not find <head> in template:'
                logger.error(msg)
                logger.error(str(d))
                raise Exception(msg)
            assert head is not None
            for x in get_manual_css_frag().contents:
                head.append(x.__copy__())

        with timeit('adding stylesheet'):
            if stylesheet is not None:
                link = Tag(name='link')
                link['rel'] = 'stylesheet'
                link['type'] = 'text/css'
                from mcdp_report.html import get_css_filename
                link['href'] = get_css_filename('compiled/%s' % stylesheet)
                head.append(link)

        with timeit('making basename2soup'):
            # Parse each document fragment, merging any AugmentedResult notes.
            basename2soup = OrderedDict()
            for doc_to_join in files_contents:
                if doc_to_join.docname in basename2soup:
                    msg = 'Repeated docname %r' % doc_to_join.docname
                    raise ValueError(msg)
                from .latex.latex_preprocess import assert_not_inside
                if isinstance(doc_to_join.contents, AugmentedResult):
                    result.merge(doc_to_join.contents)
                    contents = doc_to_join.contents.get_result()
                else:
                    contents = doc_to_join.contents
                assert_not_inside(contents, '<fragment')
                assert_not_inside(contents, 'DOCTYPE')
                frag = bs(contents)
                basename2soup[doc_to_join.docname] = frag

        # with timeit('fix_duplicate_ids'):
        # XXX
        # fix_duplicated_ids(basename2soup)

        with timeit('copy contents'):
            # Move each fragment's children into the template body.
            body = d.find('body')
            add_comments = False
            for docname, content in basename2soup.items():
                if add_comments:
                    body.append(NavigableString('\n\n'))
                    body.append(
                        Comment('Beginning of document dump of %r' % docname))
                    body.append(NavigableString('\n\n'))
                try_faster = True
                if try_faster:
                    # extract() moves nodes instead of copying them
                    for e in list(content.children):
                        body.append(e.extract())
                else:
                    copy_contents_into(content, body)
                if add_comments:
                    body.append(NavigableString('\n\n'))
                    body.append(Comment('End of document dump of %r' % docname))
                    body.append(NavigableString('\n\n'))

        with timeit('extract_bibtex_blocks'):
            extract_bibtex_blocks(d)

        with timeit('ID_PUT_BIB_HERE'):
            # Ensure a bibliography placeholder exists, then fill it.
            ID_PUT_BIB_HERE = MCDPManualConstants.ID_PUT_BIB_HERE
            bibhere = d.find('div', id=ID_PUT_BIB_HERE)
            if bibhere is None:
                msg = ('Could not find #%s in document. '
                       'Adding one at end of document.') % ID_PUT_BIB_HERE
                result.note_warning(msg)
                bibhere = Tag(name='div')
                bibhere.attrs['id'] = ID_PUT_BIB_HERE
                d.find('body').append(bibhere)
            do_bib(d, bibhere)

        with timeit('hook_before_final_pass'):
            if hook_before_final_pass is not None:
                hook_before_final_pass(soup=d)

        with timeit('document_final_pass_before_toc'):
            location = LocationUnknown()
            document_final_pass_before_toc(d, remove, remove_selectors, result,
                                           location)

        with timeit('hook_before_toc'):
            if hook_before_toc is not None:
                hook_before_toc(soup=d)

        with timeit('generate_and_add_toc'):
            try:
                generate_and_add_toc(d, raise_error=True, res=result)
            except NoTocPlaceholder as e:
                if require_toc_placeholder:
                    msg = 'Could not find toc placeholder: %s' % e
                    # logger.error(msg)
                    if aug0 is not None:
                        result.note_error(msg)
                    else:
                        raise Exception(msg)

        with timeit('document_final_pass_after_toc'):
            document_final_pass_after_toc(
                soup=d, crossrefs=crossrefs,
                resolve_references=resolve_references, res=result)

        if extra_css is not None:
            logger.info('adding extra CSS')
            add_extra_css(d, extra_css)

        with timeit('document_only_once'):
            document_only_once(d)

        location = LocationUnknown()
        substitute_github_refs(d, defaults={}, res=result, location=location)

        with timeit('another A pass'):
            # Resolve external references: replace href and fill empty links.
            for a in d.select('a[href]'):
                href = a.attrs['href']
                if href in references:
                    r = references[href]
                    a.attrs['href'] = r.url
                    # NOTE(review): `a.children` is an iterator, so
                    # `not a.children` is always False and r.title is never
                    # appended — confirm whether `not list(a.children)` was
                    # intended.
                    if not a.children:  # empty
                        a.append(r.title)

        # do not use to_html_stripping_fragment - this is a complete doc
        # mark_in_html(result, soup=d)

        add_github_links_if_edit_url(soup=d, permalink_prefix=permalink_prefix)

        with timeit('converting to string'):
            res = unicode(d)

        with timeit('encoding'):
            res = res.encode('utf8')

        logger.info('done - %.1f MB' % (len(res) / (1024 * 1024.0)))
        result.set_result(res)
        return result
def check_no_headers_inside_div(x):
    """Log an error if the given <div> element contains any h1-h5 header."""
    if x.name != 'div':
        return
    headers = list(x.find_all(['h1', 'h2', 'h3', 'h4', 'h5']))
    if headers:
        msg = 'There are headers inside this <div>'
        logger.error(msg)