def test_text_acquisition_methods(self):
    # These methods are intended for use against Tag, but they
    # work on NavigableString as well.
    s = NavigableString("fee ")
    cdata = CData("fie ")
    comment = Comment("foe ")

    assert "fee " == s.get_text()
    assert "fee" == s.get_text(strip=True)
    assert ["fee "] == list(s.strings)
    assert ["fee"] == list(s.stripped_strings)
    assert ["fee "] == list(s._all_strings())

    assert "fie " == cdata.get_text()
    assert "fie" == cdata.get_text(strip=True)
    assert ["fie "] == list(cdata.strings)
    assert ["fie"] == list(cdata.stripped_strings)
    assert ["fie "] == list(cdata._all_strings())

    # Since a Comment isn't normally considered 'text',
    # these methods generally do nothing.
    assert "" == comment.get_text()
    assert [] == list(comment.strings)
    assert [] == list(comment.stripped_strings)
    assert [] == list(comment._all_strings())

    # Unless you specifically say that comments are okay.
    assert "foe" == comment.get_text(strip=True, types=Comment)
    assert "foe " == comment.get_text(types=(Comment, NavigableString))

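# A standalone sketch of the same behavior against a parsed document rather
# than bare nodes; assumes a recent bs4 where get_text() skips comments by
# default. Illustration only, not part of the test suite above.
from bs4 import BeautifulSoup, Comment

soup = BeautifulSoup("<p>fee <!--foe--></p>", "html.parser")
print(soup.p.get_text())                  # "fee " -- the comment is skipped
print(soup.p.get_text(types=(Comment,)))  # "foe" -- comments only
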
def _translate(node: NavigableString):
    try:
        node.replace_with(
            self.translate(str(node),
                           destination_language=dest_lang,
                           source_language=source_lang).result)
    except Exception:
        # Ignore if no result was found or an error occurred.
        pass

def _write_title(self, file_object, conversation):
    string = self.fill_pattern(conversation, self.TITLE_PATTERN,
                               self.TIME_FMT_TITLE, untransform=True)
    string_elem = NavigableString(string)
    string_elem.setup()  # workaround for a BeautifulSoup issue
    formatted_title = string_elem.output_ready()
    file_object.write(
        self.TITLE_LINE_FMT % (formatted_title, formatted_title) + '\n')

def add_link_to_ndpname(tag, href):
    initial, middle, final = break_string(tag.string)
    tag.string = ''
    name = middle
    attrs = {'class': 'link-to-model', 'href': href, 'target': '_blank'}
    new_tag = Tag(name="a", attrs=attrs)
    new_tag.string = name
    tag.append(NavigableString(initial))
    tag.append(new_tag)
    tag.append(NavigableString(final))

def insert_escaped_tags(self, tags):
    """For each tag in "tags", insert contextual tags (e.g., <p> </p>) as
    escaped text so that these tags are still there when the HTML markup
    is stripped out."""
    found = False
    for tag in tags:
        strs = list(tag.strings)
        if len(strs) > 0:
            l = tag.name
            strs[0].parent.insert(0, NavigableString("<" + l + ">"))
            strs[-1].parent.append(NavigableString("</" + l + ">"))
            found = True
    return found

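# A hedged usage sketch: after marking, the tag names survive plain-text
# extraction. 'self' is unused above, so None stands in for it here.
from bs4 import BeautifulSoup

soup = BeautifulSoup("<div><p>hello</p></div>", "html.parser")
insert_escaped_tags(None, soup.find_all("p"))
print(soup.get_text())  # <p>hello</p>
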
def _convert_element(self, wrapper_element: HtmlTag, parent: Tag = None) -> Tag:
    native_element: Tag = Tag(name=wrapper_element.name,
                              attrs=wrapper_element.attributes,
                              parent=parent,
                              previous=NavigableString('\n'),
                              builder=BUILDER)
    for child in wrapper_element.children:
        native_element.contents.append(
            self._convert_element(child, native_element))
    # Note: if the wrapper has text, it replaces any converted children.
    if len(wrapper_element.text) > 0:
        native_element.contents = [NavigableString(wrapper_element.text)]
    return native_element

def __init__(self, **kwargs):
    self._alias = ''
    self._sender = ''
    self._text = ''
    self._time = ''
    self._delayed = False
    self._alternate = False
    self._html = []
    self._isuser = False
    for k, v in kwargs.items():
        setattr(self, '_' + k, v)
    self._system = bool(kwargs.get('system'))
    if self._text and not self._html:
        self._html = [NavigableString(self._text)]
    for argname in ('alias', 'sender', 'text'):
        _validate_argument(getattr(self, '_' + argname), argname, basestring)
    if self._system:
        self._alias = ''
        self._sender = ''
    elif not self._alias and not self._sender:
        raise ArgumentError('non-system Entry must have sender or alias')
    elif self._alias == self._sender:
        self._alias = ''
    _validate_argument(self._time, 'time', datetime.datetime)
    _validate_argument(self._html, 'html', list)
    for e in self._html:
        _validate_argument(e, 'html', PageElement)

def _parse_status(self, comment, info, conversation):
    if comment:
        info['type'], info['system'], info['sender'] = \
            comment.split("|", 2)
        info['type'] = int(info['type'])
        if info['type'] in Status.USER_TYPES:
            l = info['html'][0].split(': ', 1)
            if len(l) == 2:
                info['msg_html'] = [NavigableString(l[1])] + info['html'][1:]
                info['html'] = []
        return
    s = ''.join([x.text if isinstance(x, Tag) else x.string
                 for x in info['html']])
    info['sender'] = conversation.source if s.startswith(_("You")) else None
    if not info['type']:
        typemap = (dict(self.STATUS_TYPEMAP, **self.CHAT_STATUS_TYPEMAP)
                   if conversation.isgroup else self.STATUS_TYPEMAP)
        for pattern, t in typemap.items():
            i = util.parse_string(s, pattern)
            if i is not None:
                for k, v in i.items():
                    info[k] = v
                # Special case for 'is no longer <type>'.
                typestr = i.get('type')
                if typestr:
                    info['type'] = Status.OPPOSITES[Status.PAM_EPYT[typestr]]
                else:
                    info['type'] = t
                break

def _parse(self, course: NavigableString) -> ParseType:
    """Parses a course to get its link, icon URL, title, description,
    and counts.

    :course: BeautifulSoup object"""
    info = {
        "link": "",
        "icon": "",
        "title": "",
        "description": "",
        "counts": {},
    }
    info["link"] = course.a["href"]
    info["icon"] = course.a.img["src"]
    description: NavigableString = course.a.div
    info["title"] = description.div.get_text()
    info["description"] = description.p.get_text()
    counts: NavigableString = course.find("div", {"class": "courseCounts"})
    counts_data: ResultSet = counts.find_all("li")
    for data in counts_data:
        name: str = data.span.get_text().lower()
        val: str = data.find("p").get_text()
        info["counts"][name] = int(val.replace(",", ""))
    return info

def _populate_fields(self, form_element: Tag,
                     field_values: Dict[str, List[str]]) -> None:
    """
    Inserts the provided values into the form input elements inside the
    exercise element.
    """
    # Find all form fields on the exercise page and fill in the values.
    field_elements = form_element.find_all(['input', 'select', 'textarea'])
    for field_element in field_elements:
        field_element = cast(Tag, field_element)
        field_name = cast(str, field_element.get('name'))
        if field_name not in field_values:
            continue
        if field_element.name == 'input':
            if field_element.get('type') in ('radio', 'checkbox'):
                if field_element.get('value') in field_values[field_name]:
                    field_element['checked'] = ''
                else:
                    del field_element['checked']
            else:
                field_element['value'] = field_values[field_name][0]
        elif field_element.name == 'select':
            for option_element in field_element.find_all('option'):
                option_element = cast(Tag, option_element)
                if option_element.get('value') in field_values[field_name]:
                    option_element['selected'] = ''
                else:
                    del option_element['selected']
        elif field_element.name == 'textarea':
            string_content = NavigableString(field_values[field_name][0])
            field_element.contents = [string_content]

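# A hedged usage sketch; the enclosing class is elided, so the method is
# called unbound with None for self. Assumes only bs4 and typing.
from bs4 import BeautifulSoup

form = BeautifulSoup(
    '<form><input name="q" type="text"/>'
    '<textarea name="note"></textarea></form>', 'html.parser').form
_populate_fields(None, form, {'q': ['hello'], 'note': ['world']})
print(form)
# <form><input name="q" type="text" value="hello"/>
# <textarea name="note">world</textarea></form>
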
def recursive_traversal(element, data):
    if element.name in EXCLUDE_BLOCKS:
        return
    block_children = []
    non_block = []
    non_block_with_attr = []
    navigable_strings = []
    for child in element.children:
        if isinstance(child, NavigableString):
            if str(child) != '' and not child.isspace():
                navigable_strings.append(child)
        elif child.name not in NON_BLOCK_ELEMENTS:
            block_children.append(child)
        elif any(child.has_attr(attr) for attr in TEXT_ATTRS):
            non_block_with_attr.append(child)
        else:
            non_block.append(child)
    if block_children:
        for child in non_block_with_attr:
            replace_attrs(child, data)
        for child in block_children:
            replace_attrs(child, data)
            recursive_traversal(child, data)
        for navigable_str in navigable_strings:
            string_hash = hashlib.md5(navigable_str.encode()).hexdigest()
            data[string_hash] = str(navigable_str)
            navigable_str.replace_with(
                NavigableString(f"{{{{localize('{string_hash}')}}}}"))
        for block in non_block:
            hash_element_content(block, data)
    else:
        hash_element_content(element, data)

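# The hashing/replacement step in isolation, as a minimal sketch; the
# EXCLUDE_BLOCKS / NON_BLOCK_ELEMENTS / TEXT_ATTRS machinery is
# project-specific and elided here.
import hashlib
from bs4 import BeautifulSoup, NavigableString

soup = BeautifulSoup("<p>Hello world</p>", "html.parser")
data = {}
node = soup.p.string
string_hash = hashlib.md5(node.encode()).hexdigest()
data[string_hash] = str(node)
node.replace_with(NavigableString(f"{{{{localize('{string_hash}')}}}}"))
print(soup)  # <p>{{localize('<md5>')}}</p>
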
def recursive_replace(tag):
    if hasattr(tag, "contents"):
        for i in range(len(tag.contents)):
            child = tag.contents[i]
            if child.name == "code":
                tag.contents[i] = NavigableString(self.store(str(child)))
            else:
                recursive_replace(child)

def make_tag(tag0, klass, data, ndp=None, template=None, poset=None):
    svg = data['svg']
    tag_svg = BeautifulSoup(svg, 'lxml', from_encoding='utf-8').svg
    assert tag_svg.name == 'svg'
    if tag_svg.has_attr('width'):
        ws = tag_svg['width']
        hs = tag_svg['height']
        assert 'pt' in ws
        w = float(ws.replace('pt', ''))
        h = float(hs.replace('pt', ''))
        scale = MCDPConstants.scale_svg
        w2 = w * scale
        h2 = h * scale
        tag_svg['width'] = w2
        tag_svg['height'] = h2
        tag_svg['rescaled'] = 'Rescaled from %s %s, scale = %s' % (ws, hs, scale)
    else:
        print('no width in SVG tag: %s' % tag_svg)

    tag_svg['class'] = klass

    if tag0.has_attr('style'):
        tag_svg['style'] = tag0['style']
    if tag0.has_attr('id'):
        tag_svg['id'] = tag0['id']

    if generate_pdf:
        pdf0 = data['pdf']
        pdf = crop_pdf(pdf0, margins=0)

        div = Tag(name='div')
        att = MCDPConstants.ATTR_LOAD_NAME
        if tag0.has_attr('id'):
            basename = tag0['id']
        elif ndp is not None and hasattr(ndp, att):
            basename = getattr(ndp, att)
        elif template is not None and hasattr(template, att):
            basename = getattr(template, att)
        elif poset is not None and hasattr(poset, att):
            basename = getattr(poset, att)
        else:
            hashcode = hashlib.sha224(tag0.string).hexdigest()[-8:]
            basename = 'code-%s' % hashcode

        docname = os.path.splitext(os.path.basename(realpath))[0]
        download = docname + "." + basename + "." + klass + '.pdf'
        a = create_a_to_data(download=download, data_format='pdf', data=pdf)
        a['class'] = 'pdf_data'
        a.append(NavigableString(download))
        div.append(tag_svg)
        div.append(a)
        return div
    else:
        return tag_svg

def html(self):
    if not self._html:
        self._html = []
        if self.type in self.USER_TYPES:
            s = self.STATUS_STRING_FMT % \
                (self.alias if self.alias else self.sender,
                 self.typestr,
                 ': ' if self.msg_html else '')
            self._html.append(NavigableString(s))
            self._html.extend(self.msg_html)
        else:
            self._html.append(self.typestr)
    return self._html

def _parse_line(self, line, conversation, source, transformed_source):
    """Return (cons, attrs)."""
    status_html = []
    attrs = {}
    cons = None
    for elem in BeautifulSoup(line, ['lxml', 'xml']).children:
        if isinstance(elem, Comment):
            alternate, status_html = elem.split('|', 1)
            attrs['alternate'] = bool(alternate)
            status_html = [NavigableString(status_html)]
            continue
        for key in ('alias', 'sender', 'auto', 'time'):
            attrs[key] = elem.get(key, '')
        if attrs['sender'] == source:
            attrs['sender'] = transformed_source
            attrs['isuser'] = True
        else:
            attrs['isuser'] = False
        attrs['auto'] = bool(attrs['auto'])
        if attrs['time']:
            fmt = self.STRPTIME_FMT_CONVERSATION
            attrs['time'] = self._parse_time(attrs['time'], fmt)
        attrs['html'] = list(elem.children)
        if elem.name == 'status':
            cons = Status
            attrs['type'] = self.STATUS_TYPEMAP.get(elem.get('type'), None)
            if attrs['type'] in Status.USER_TYPES:
                attrs['msg_html'] = attrs['html']
                attrs['html'] = status_html
        elif elem.name == 'event':
            cons = Event
            attrs['type'] = self.EVENT_TYPEMAP.get(elem.get('type'), None)
        elif elem.name == 'message':
            cons = Message
        else:
            raise TypeError("unknown type '%s' for entry" % elem.name)
        if not attrs['sender'] and not attrs['alias']:
            print_d("%s is a system entry" % elem)
            attrs['system'] = True
    if not cons:
        raise ParseError("could not parse line: '%s'" % line)
    return cons, attrs

def _parse_details(self, code: NavigableString) -> ParseType:
    """Parses a codeContainer and extracts all the info."""
    # Format of details:
    # {votes: 1184, answers: 24077, post_link: <PostLink>, title: <Title>,
    #  tags: [<Tags>, ...], author_name: <AuthorName>, author_link: <Link>,
    #  data_date: <DateTime>, avatar_link: <Link>}
    details: ParseType = {}
    post_stats: NavigableString = code.find("div", {"class": "postStats"})
    post_stats_children = list(post_stats.children)
    details["votes"] = post_stats_children[1].p.string
    # Note the spelling of <a class='postAnsewers'> on the site.
    details["answers"] = post_stats_children[3].p.string
    post_details: NavigableString = code.find("div", {"class": "postDetails"})
    details["post_link"] = post_details.p.a["href"]
    details["title"] = post_details.p.a.string
    tags_wrapper: NavigableString = list(post_details.children)[3]
    tags: ResultSet = tags_wrapper.find_all("span")
    tag_list: List[str] = []
    for tag in tags:
        tag_list.append(tag.string)
    details["tags"] = tag_list
    author_details: NavigableString = code.find("div", {"class": "authorDetails"})
    details["author_name"] = author_details.div.a.string
    details["author_link"] = author_details.div.a["href"]
    details["data_date"] = author_details.p["data-date"]
    details["avatar_link"] = list(author_details.children)[3].img["src"]
    return details

def get_bibliography(bibfile):
    data = open(bibfile).read()
    frag = bs(data)
    res = Tag(name='div')
    ids = []
    for dt in frag.select('dt'):
        assert dt.name == 'dt'
        name = dt.a.attrs['name']
        name = 'bib:' + name
        ids.append(name)
        dd = dt.findNext('dd')
        assert dd.name == 'dd'
        entry = dd.__copy__()
        entry.name = 'cite'
        entry.attrs['id'] = name

        try_to_replace_stuff = False
        if try_to_replace_stuff:
            for x in entry.descendants:
                # print('child', x)
                if isinstance(x, NavigableString):
                    s = x.string.encode('utf-8')
                    s = s.replace('\n', ' ')
                    s = s.replace('[', '')
                    s = s.replace('|', '')
                    s = s.replace(']', '')
                    y = NavigableString(unicode(s, 'utf-8'))
                    x.replace_with(y)
                    # print('string %r' % x.string)
                if isinstance(x, Tag) and x.name == 'a' and x.string == 'bib':
                    x.extract()

        res.append(NavigableString('\n'))
        res.append(entry)
        res.append(NavigableString('\n'))
    print('Found %d bib entries.' % len(ids))
    return res

def __init__(self, **kwargs):
    self._msg_text = ''
    self._msg_html = []
    atype = kwargs.get('type', None)
    if atype < self._MIN or atype > self._MAX:
        raise TypeError("unknown type %r for status" % atype)
    self._type = atype
    if self._msg_text and not self._msg_html:
        self._msg_html = [NavigableString(self._msg_text)]
    if self._type in self.SYSTEM_STATUSES:
        kwargs['system'] = True
    super(Status, self).__init__(**kwargs)
    self._has_other_html = bool(self._html)

def _addEndDot(self, node, soup):
    if not node.contents:
        return
    last_content = node.contents[-1]
    is_navigable = isinstance(last_content, NavigableString)
    text = (last_content if is_navigable
            else last_content.get_text(separator=' ', strip=True,
                                       types=[NavigableString]))
    text = text.strip(' .:;)\n\r') + '. '
    if is_navigable:
        node.contents[-1].replace_with(NavigableString(text))
        # print(node.contents[-1])
    else:
        last_content.string = text

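# A quick hedged check: trailing punctuation is normalized to a single dot.
# 'self' and 'soup' are unused above, so stand-ins are passed.
from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>hello;</p>", "html.parser")
_addEndDot(None, soup.p, soup)
print(soup)  # <p>hello. </p>
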
def substitute_task_marker_p(p, sub, klass):
    try:
        for element in p.descendants:
            if not isinstance(element, NavigableString):
                continue
            s = element.string
            if sub in s:
                add_class(p, klass)
                s2 = s.replace(sub, '')
                ns = NavigableString(s2)
                element.replaceWith(ns)
    except AttributeError as e:
        # a bug with bs4
        msg = 'Bug with descendants: %s' % e
        logger.debug(msg)

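# A hedged check with a throwaway add_class stand-in (the real helper is
# project-specific).
from bs4 import BeautifulSoup

def add_class(tag, klass):
    tag['class'] = tag.get('class', []) + [klass]

soup = BeautifulSoup("<p>do this XXX soon</p>", "html.parser")
substitute_task_marker_p(soup.p, "XXX", "task")
print(soup)  # <p class="task">do this  soon</p>
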
def insertBefore(self, node, refNode):
    index = self._nodeIndex(node, refNode)
    if (node.element.__class__ == NavigableString
            and self.element.contents
            and self.element.contents[index - 1].__class__ == NavigableString):
        # (See comments in appendChild)
        newStr = NavigableString(self.element.contents[index - 1]
                                 + node.element)
        oldNode = self.element.contents[index - 1]
        del self.element.contents[index - 1]
        oldNode.parent = None
        oldNode.extract()
        self.element.insert(index - 1, newStr)
    else:
        self.element.insert(index, node.element)
        node.parent = self

def modify_html(html, request):
    soup = BeautifulSoup(html, 'html5lib')
    pattern = r'\b(\w{6})\b'

    # Find all text inside tags.
    def has_content(tag):
        return any(
            bool(re.search(pattern, content))
            for content in tag.contents
            if isinstance(content, NavigableString))

    # Modify each text instance inside tags.
    for tag in soup.body.find_all(has_content):
        for content in tag.contents:
            if isinstance(content, NavigableString):
                new_content = NavigableString(
                    re.sub(pattern, r'\1™', content))
                content.replace_with(new_content)
    return str(soup)

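# Hypothetical call, assuming the html5lib parser is installed as the
# snippet requires; 'request' is unused, so None is passed. Six-letter
# words gain a trademark sign.
html = "<html><body><p>Python coding</p></body></html>"
print(modify_html(html, None))
# ... <p>Python™ coding™</p> ...
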
def appendChild(self, node):
    if (node.element.__class__ == NavigableString
            and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
        # Concatenate new text onto old text node
        # (TODO: This has O(n^2) performance, for input like
        # "a</a>a</a>a</a>...")
        newStr = NavigableString(self.element.contents[-1] + node.element)

        # Remove the old text node
        # (Can't simply use .extract() by itself, because it fails if
        # an equal text node exists within the parent node)
        oldElement = self.element.contents[-1]
        del self.element.contents[-1]
        oldElement.parent = None
        oldElement.extract()

        self.element.insert(len(self.element.contents), newStr)
    else:
        self.element.insert(len(self.element.contents), node.element)
        node.parent = self

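# NavigableString subclasses str, which is what makes the concatenation
# above ('contents[-1] + node.element') work; a minimal check:
from bs4 import NavigableString

merged = NavigableString(NavigableString("foo") + NavigableString("bar"))
print(repr(str(merged)))  # 'foobar'
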
def substitute_special_paragraph(soup, prefix, klass):
    """ Looks for paragraphs that start with a simple string with the given
        prefix.

        From:

            <p>prefix contents</p>

        Creates:

            <div class='klass-wrap'><p class='klass'>contents</p></div>
    """
    ps = list(soup.select('p'))
    for p in ps:
        # Get the first child.
        contents = list(p.contents)
        if not contents:
            continue
        c = contents[0]
        if not isinstance(c, NavigableString):
            continue
        s = c.string
        starts = s.lower().startswith(prefix.lower())
        if not starts:
            continue
        without = s[len(prefix):]
        ns = NavigableString(without)
        c.replaceWith(ns)

        div = Tag(name='div')
        add_class(div, klass + '-wrap')
        add_class(p, klass)

        parent = p.parent
        i = parent.index(p)
        p.extract()
        div.append(p)
        parent.insert(i, div)

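# A hedged usage sketch with a throwaway add_class stand-in; the output
# shape follows the docstring above.
from bs4 import BeautifulSoup

def add_class(tag, klass):
    tag['class'] = tag.get('class', []) + [klass]

soup = BeautifulSoup("<p>note: remember this</p>", "html.parser")
substitute_special_paragraph(soup, "note:", "special")
print(soup)
# <div class="special-wrap"><p class="special"> remember this</p></div>
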
def substituting_empty_links(soup, raise_errors=False):
    '''
        default style is  [](#sec:systems)              "Chapter 10"
        the name is       [](#sec:systems?only_name)    "My title"
        the number is     [](#sec:systems?only_number)  "10"
        and full is       [](#sec:systems?toc_link)     "Chapter 10 - My title"

        You can also use "class":

            <a href='#sec:name' class='only_number'></a>

        or

            <a href='#sec:name?only_number'></a>
    '''
    CLASS_ONLY_NUMBER = MCDPManualConstants.CLASS_ONLY_NUMBER
    CLASS_NUMBER_NAME = MCDPManualConstants.CLASS_NUMBER_NAME
    CLASS_ONLY_NAME = MCDPManualConstants.CLASS_ONLY_NAME

    logger.debug('substituting_empty_links')
    n = 0
    nerrors = 0
    for le in get_empty_links_to_fragment(soup):
        a = le.linker
        element_id = le.eid
        element = le.linked
        n += 1
        if not element:
            msg = 'Cannot find %s' % element_id
            note_error_msg(a, msg)
            nerrors += 1
            if raise_errors:
                raise ValueError(msg)
            continue

        # If there is a query, remove it.
        if le.query is not None:
            new_href = '#' + le.eid
            a.attrs['href'] = new_href
            logger.info('setting new href = %s' % new_href)

        if (LABEL_WHAT_NUMBER not in element.attrs) or \
                (LABEL_NAME not in element.attrs):
            msg = ('substituting_empty_links: Could not find attributes '
                   '%s or %s in %s' % (LABEL_NAME, LABEL_WHAT_NUMBER, element))
            if True:
                logger.warning(msg)
            else:
                note_error_msg(a, msg)
                nerrors += 1
                if raise_errors:
                    raise ValueError(msg)
            continue

        label_what_number = element.attrs[LABEL_WHAT_NUMBER]
        label_number = element.attrs[LABEL_NUMBER]
        label_what = element.attrs[LABEL_WHAT]
        label_name = element.attrs[LABEL_NAME]

        classes = list(a.attrs.get('class', []))  # bug: I was modifying
        if le.query is not None:
            classes.append(le.query)

        if 'toc_link' in classes:
            s = Tag(name='span')
            s.string = label_what
            add_class(s, 'toc_what')
            a.append(s)
            a.append(' ')

            s = Tag(name='span')
            s.string = label_number
            add_class(s, 'toc_number')
            a.append(s)

            s = Tag(name='span')
            s.string = ' - '
            add_class(s, 'toc_sep')
            a.append(s)

            if label_name is not None and '<' in label_name:
                contents = bs(label_name)
                # Sanitize the label name.
                for br in contents.findAll('br'):
                    br.replaceWith(NavigableString(' '))
                for _ in contents.findAll('a'):
                    _.extract()
                a.append(contents)
                # logger.debug('From label_name = %r to a = %r' % (label_name, a))
            else:
                s = Tag(name='span')
                if label_name is None:
                    s.string = '(unnamed)'  # XXX
                else:
                    s.string = label_name
                add_class(s, 'toc_name')
                a.append(s)
        else:
            if CLASS_ONLY_NUMBER in classes:
                label = label_number
            elif CLASS_NUMBER_NAME in classes:
                if label_name is None:
                    label = label_what_number + ' - (unnamed)'  # warning
                else:
                    label = label_what_number + ' - ' + label_name
            elif CLASS_ONLY_NAME in classes:
                if label_name is None:
                    label = '(unnamed)'  # warning
                else:
                    label = label_name
            else:
                label = label_what_number

            span1 = Tag(name='span')
            add_class(span1, 'reflabel')
            span1.string = label
            a.append(span1)

    logger.debug('substituting_empty_links: %d total, %d errors'
                 % (n, nerrors))

def unwrap_ul(li: element.NavigableString) -> WebToonChapter:
    link = li.find('a')['href']
    pretty_name = li.find('img')['alt']
    chapter = WebToonChapter.from_url(link, pretty_name)
    return chapter

def do_bib(soup, bibhere):
    """ Finds the used bibliography entries and puts them there. """
    used = []
    unused = set()
    for a in soup.find_all('a'):
        href = a.attrs.get('href', '')
        if href.startswith('#bib:'):
            used.append(href[1:])  # no "#"
    logger.debug('I found %d references, to these: %s' % (len(used), used))

    # Collect all the <cite>.
    id2cite = {}
    for c in soup.find_all('cite'):
        ID = c.attrs.get('id', None)
        id2cite[ID] = c
        if ID in used:
            add_class(c, 'used')
        else:
            unused.add(ID)
            add_class(c, 'unused')

    # Divide into found and not found.
    found = []
    notfound = []
    for ID in used:
        if ID not in id2cite:
            if ID not in notfound:
                notfound.append(ID)
        else:
            found.append(ID)

    # Now create an additional <cite> for each one that was not found.
    for ID in notfound:
        cite = Tag(name='cite')
        s = 'Reference %s not found.' % ID
        cite.append(NavigableString(s))
        cite.attrs['class'] = ['errored', 'error']  # XXX
        soup.append(cite)
        id2cite[ID] = cite

    # Now number the cites.
    n = 1
    id2number = {}
    for ID in used:
        if ID not in id2number:
            id2number[ID] = n
            n += 1

    # Now add the attributes for cross-referencing.
    for ID in used:
        number = id2number[ID]
        cite = id2cite[ID]
        cite.attrs[LABEL_NAME] = '[%s]' % number
        cite.attrs[LABEL_SELF] = '[%s]' % number
        cite.attrs[LABEL_NUMBER] = number
        cite.attrs[LABEL_WHAT] = 'Reference'
        cite.attrs[LABEL_WHAT_NUMBER_NAME] = '[%s]' % number
        cite.attrs[LABEL_WHAT_NUMBER] = '[%s]' % number

    # Now put the cites at the end of the document.
    for ID in used:
        c = id2cite[ID]
        # Remove it from its parent.
        c.extract()
        # logger.debug('Extracting cite for %r: %s' % (ID, c))
        # Add it to the bibliography.
        bibhere.append(c)

    s = ("Bib cites: %d\nBib used: %s\nfound: %s\nnot found: %s\nunused: %d"
         % (len(id2cite), len(used), len(found), len(notfound), len(unused)))
    logger.info(s)

def get_minimal_document(body_contents, title=None,
                         add_markdown_css=False, add_manual_css=False,
                         stylesheet=None, extra_css=None):
    """ Creates the minimal HTML document with MCDPL CSS.

        add_markdown_css: language + markdown
        add_manual_css: language + markdown + (manual*)
        extra_css: additional CSS contents
    """
    check_html_fragment(body_contents)
    soup = bs("")
    assert soup.name == 'fragment'

    if title is None:
        title = ''

    html = Tag(name='html')
    head = Tag(name='head')
    body = Tag(name='body')
    head.append(Tag(name='meta',
                    attrs={'http-equiv': "Content-Type",
                           'content': "application/xhtml+xml; charset=utf-8"}))

    if stylesheet is None:
        stylesheet = 'v_mcdp_render_default'
    if add_markdown_css or add_manual_css:
        link = Tag(name='link')
        link['rel'] = 'stylesheet'
        link['type'] = 'text/css'
        link['href'] = get_css_filename('compiled/%s' % stylesheet)
        head.append(link)

    tag_title = Tag(name='title')
    tag_title.append(NavigableString(title))
    head.append(tag_title)

    parsed = bs(body_contents)
    assert parsed.name == 'fragment'
    parsed.name = 'div'
    body.append(parsed)
    html.append(head)
    html.append(body)
    soup.append(html)

    if extra_css is not None:
        add_extra_css(soup, extra_css)

    s = to_html_stripping_fragment_document(soup)
    assert 'DOCTYPE' not in s
    # s = html.prettify()  # no: it removes empty text nodes
    # ns = """<?xml version="1.0" encoding="utf-8" ?>"""
    ns = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN"
    "http://www.w3.org/2002/04/xhtml-math-svg/xhtml-math-svg.dtd">"""
    res = ns + '\n' + s

    # if add_manual_css and MCDPConstants.manual_link_css_instead_of_including:
    #     assert 'manual.css' in res, res

    res = res.replace('<div><!DOCTYPE html>', '<div>')
    return res

def mark_console_pres_highlight(soup, res, location):
    for code in soup.select('pre code'):
        pre = code.parent
        if code.string is None:
            continue
        s0 = code.string

        from HTMLParser import HTMLParser
        h = HTMLParser()
        s = h.unescape(s0)
        if s != s0:
            # print('decoded %r -> %r' % (s0, s))
            pass

        beg = s.strip()
        # Is it a console line?
        ct = is_console_line(beg)
        if ct is None:
            continue

        add_class(pre, 'console')

        # Add class "on-hostname".
        if ct.hostname is not None:
            cn = 'on-%s' % str(ct.hostname)
            add_class(pre, cn)

        code.string = ''
        lines = s.split('\n')

        def is_program(x, l):
            if x == 'git' and 'apt' in l:
                return False
            return x in programs

        for j, line in enumerate(lines):
            tokens = line.split(' ')
            for i, token in enumerate(tokens):
                previous_is_sudo_or_dollar = \
                    i >= 1 and tokens[i - 1] in ['$', 'sudo']
                if token in ['$', 'DOLLAR']:
                    # Add <span class="console_sign">$</span>.
                    e = Tag(name='span')
                    e['class'] = 'console_sign'
                    e.string = '$'
                    code.append(e)
                elif i == 0 and token == ct.hostname:
                    # It's the hostname.
                    e = Tag(name='span')
                    e['class'] = 'hostname'
                    e.string = token
                    code.append(e)
                elif is_program(token, line) and previous_is_sudo_or_dollar:
                    e = Tag(name='span')
                    e['class'] = '%s program' % token
                    e.string = token
                    code.append(e)
                elif token in program_commands:
                    e = Tag(name='span')
                    e['class'] = '%s program_command' % token
                    e.string = token
                    code.append(e)
                elif token and token[0] == '-':
                    e = Tag(name='span')
                    e['class'] = 'program_option'
                    e.string = token
                    code.append(e)
                else:
                    code.append(NavigableString(token))

                is_last = i == len(tokens) - 1
                if not is_last:
                    before = '![' in ' '.join(tokens[:i + 1])
                    if not before:  # XXX: this is a bug
                        space = Tag(name='span')
                        space.append(' ')
                        space['class'] = 'space'
                        code.append(space)
                    else:
                        code.append(' ')

            is_last_line = j == len(lines) - 1
            if not is_last_line:
                code.append(NavigableString('\n'))

def manual_join(template, files_contents, stylesheet, remove=None,
                extra_css=None, remove_selectors=None,
                hook_before_toc=None, references=None,
                resolve_references=True, hook_before_final_pass=None,
                require_toc_placeholder=False, permalink_prefix=None,
                crossrefs_aug=None, aug0=None):
    """
        files_contents: a list of tuples that can be cast to DocToJoin,
        where the string is a unique one to be used for job naming.

        extra_css: if not None, a string of more CSS to be added
        remove_selectors: list of selectors to remove (e.g. ".draft")

        hook_before_toc, if not None, is called with
        hook_before_toc(soup=soup) just before generating the toc.
    """
    result = AugmentedResult()

    if references is None:
        references = {}
    check_isinstance(files_contents, list)

    if crossrefs_aug is None:
        crossrefs = Tag(name='no-cross-refs')
    else:
        crossrefs = bs(crossrefs_aug.get_result())
        result.merge(crossrefs_aug)
    if aug0 is not None:
        result.merge(aug0)

    @contextmanager
    def timeit(_):
        yield

    with timeit('manual_join'):
        files_contents = [DocToJoin(*_) for _ in files_contents]

        # cannot use bs() here because this is an entire document
        with timeit('parsing template'):
            template0 = template
            template = replace_macros(template)
            template_soup = BeautifulSoup(template, 'lxml',
                                          from_encoding='utf-8')
            d = template_soup
            if d.html is None:
                s = "Invalid template"
                raise_desc(ValueError, s, template0=template0)

        with timeit('adding head'):
            assert d.html is not None
            assert '<html' in str(d)
            head = d.find('head')
            if head is None:
                msg = 'Could not find <head> in template:'
                logger.error(msg)
                logger.error(str(d))
                raise Exception(msg)
            assert head is not None
            for x in get_manual_css_frag().contents:
                head.append(x.__copy__())

        with timeit('adding stylesheet'):
            if stylesheet is not None:
                link = Tag(name='link')
                link['rel'] = 'stylesheet'
                link['type'] = 'text/css'
                from mcdp_report.html import get_css_filename
                link['href'] = get_css_filename('compiled/%s' % stylesheet)
                head.append(link)

        with timeit('making basename2soup'):
            basename2soup = OrderedDict()
            for doc_to_join in files_contents:
                if doc_to_join.docname in basename2soup:
                    msg = 'Repeated docname %r' % doc_to_join.docname
                    raise ValueError(msg)
                from .latex.latex_preprocess import assert_not_inside
                if isinstance(doc_to_join.contents, AugmentedResult):
                    result.merge(doc_to_join.contents)
                    contents = doc_to_join.contents.get_result()
                else:
                    contents = doc_to_join.contents
                assert_not_inside(contents, '<fragment')
                assert_not_inside(contents, 'DOCTYPE')
                frag = bs(contents)
                basename2soup[doc_to_join.docname] = frag

        # with timeit('fix_duplicate_ids'):  # XXX
        #     fix_duplicated_ids(basename2soup)

        with timeit('copy contents'):
            body = d.find('body')
            add_comments = False
            for docname, content in basename2soup.items():
                if add_comments:
                    body.append(NavigableString('\n\n'))
                    body.append(
                        Comment('Beginning of document dump of %r' % docname))
                    body.append(NavigableString('\n\n'))
                try_faster = True
                if try_faster:
                    for e in list(content.children):
                        body.append(e.extract())
                else:
                    copy_contents_into(content, body)
                if add_comments:
                    body.append(NavigableString('\n\n'))
                    body.append(Comment('End of document dump of %r' % docname))
                    body.append(NavigableString('\n\n'))

        with timeit('extract_bibtex_blocks'):
            extract_bibtex_blocks(d)

        with timeit('ID_PUT_BIB_HERE'):
            ID_PUT_BIB_HERE = MCDPManualConstants.ID_PUT_BIB_HERE
            bibhere = d.find('div', id=ID_PUT_BIB_HERE)
            if bibhere is None:
                msg = ('Could not find #%s in document. '
                       'Adding one at end of document.') % ID_PUT_BIB_HERE
                result.note_warning(msg)
                bibhere = Tag(name='div')
                bibhere.attrs['id'] = ID_PUT_BIB_HERE
                d.find('body').append(bibhere)
            do_bib(d, bibhere)

        with timeit('hook_before_final_pass'):
            if hook_before_final_pass is not None:
                hook_before_final_pass(soup=d)

        with timeit('document_final_pass_before_toc'):
            location = LocationUnknown()
            document_final_pass_before_toc(d, remove, remove_selectors,
                                           result, location)

        with timeit('hook_before_toc'):
            if hook_before_toc is not None:
                hook_before_toc(soup=d)

        with timeit('generate_and_add_toc'):
            try:
                generate_and_add_toc(d, raise_error=True, res=result)
            except NoTocPlaceholder as e:
                if require_toc_placeholder:
                    msg = 'Could not find toc placeholder: %s' % e
                    # logger.error(msg)
                    if aug0 is not None:
                        result.note_error(msg)
                    else:
                        raise Exception(msg)

        with timeit('document_final_pass_after_toc'):
            document_final_pass_after_toc(
                soup=d, crossrefs=crossrefs,
                resolve_references=resolve_references, res=result)

        if extra_css is not None:
            logger.info('adding extra CSS')
            add_extra_css(d, extra_css)

        with timeit('document_only_once'):
            document_only_once(d)

        location = LocationUnknown()
        substitute_github_refs(d, defaults={}, res=result, location=location)

        with timeit('another A pass'):
            for a in d.select('a[href]'):
                href = a.attrs['href']
                if href in references:
                    r = references[href]
                    a.attrs['href'] = r.url
                    if not a.children:  # empty
                        a.append(r.title)

        # do not use to_html_stripping_fragment - this is a complete doc
        # mark_in_html(result, soup=d)

        add_github_links_if_edit_url(soup=d, permalink_prefix=permalink_prefix)

        with timeit('converting to string'):
            res = unicode(d)

        with timeit('encoding'):
            res = res.encode('utf8')

        logger.info('done - %.1f MB' % (len(res) / (1024 * 1024.0)))
        result.set_result(res)
    return result