def view_selection(self, req, resp, url):
    """
    View the highlighted selector (from `action_view`)

    Parses ``resp.body`` as HTML, runs the selector given in
    ``req.GET['selector']`` against it, marks every matched element and
    returns the document with a message block (rendered from the
    found / not-found template) inserted at the start of ``<body>``.

    :param req: the request; only ``req.GET['selector']`` is read here.
    :param resp: the response whose ``body`` is the HTML to inspect.
    :param url: URL of the inspected page; its directory is used for the
        injected ``<base href>`` and it is passed on to the templates.
    :returns: a ``Response`` wrapping the serialized, annotated document.
    """
    from deliverance.selector import Selector
    doc = document_fromstring(resp.body)
    # Inject <base href> pointing at the directory of the page URL
    el = Element('base')
    el.set('href', posixpath.dirname(url) + '/')
    doc.head.insert(0, el)
    selector = Selector.parse(req.GET['selector'])
    dummy_type, elements, dummy_attributes = selector(doc)
    if not elements:
        template = self._not_found_template
    else:
        template = self._found_template
    all_elements = []
    els_in_head = False
    for index, el in enumerate(elements):
        el_in_head = self._el_in_head(el)
        if el_in_head:
            els_in_head = True
        # Default anchor is deliverance-selection[-N]; an existing id wins
        anchor = 'deliverance-selection'
        if index:
            anchor += '-%s' % index
        if el.get('id'):
            anchor = el.get('id')
        ## FIXME: is a <a name> better?
        if not el_in_head:
            el.set('id', anchor)
        else:
            # Elements in <head> get no anchor
            anchor = None
        ## FIXME: add :target CSS rule
        ## FIXME: or better, some Javascript
        all_elements.append((anchor, el))
        if not el_in_head:
            # Matched elements outside <head> are outlined via inline style
            style = el.get('style', '')
            if style:
                style += '; '
            style += '/* deliverance */ border: 2px dotted #f00'
            el.set('style', style)
        else:
            # Head elements are tagged with a marker attribute instead;
            # mark_deliv_match() below looks for it in highlighted source
            el.set('DELIVERANCE-MATCH', '1')

    def highlight(html_code):
        """Highlights the given code (for use in the template)"""
        if isinstance(html_code, _Element):
            html_code = tostring(html_code)
        return html(
            pygments_highlight(html_code, HtmlLexer(),
                               HtmlFormatter(noclasses=True)))

    def format_tag(tag):
        """Highlights the lxml HTML tag"""
        return highlight(tostring(tag).split('>')[0] + '>')

    def wrap_html(html, width=100):
        """Breaks lines longer than `width` at tag boundaries"""
        if isinstance(html, _Element):
            html = tostring(html)
        lines = html.splitlines()
        new_lines = []

        def wrap_html_line(line):
            """Returns `line` split into a list of shorter pieces"""
            if len(line) <= width:
                return [line]
            # Leading text followed by a closing tag: split right after it
            match_trail = re.search(r'^[^<]*</.*?>', line, re.S)
            if match_trail:
                result = [match_trail.group(0)]
                result.extend(wrap_html_line(line[match_trail.end():]))
                return result
            # Otherwise peel off the first tag and the last tag and
            # recurse on whatever lies between them
            match1 = re.search(r'^[^<]*<[^>]*>', line, re.S)
            match2 = re.search(r'<[^>]*>[^<>]*$', line, re.S)
            if not match1 or not match2:
                return [line]
            if match2.start() < match1.end():
                # Both patterns matched the same (only) tag.  Splitting
                # here would emit that tag twice with an empty line in
                # between, so the line is left unbroken.
                return [line]
            result = [match1.group(0)]
            result.extend(wrap_html_line(
                line[match1.end():match2.start()]))
            result.append(match2.group(0))
            return result

        for line in lines:
            new_lines.extend(wrap_html_line(line))
        return '\n'.join(new_lines)

    def mark_deliv_match(highlighted_text):
        """Wraps tags carrying DELIVERANCE-MATCH in a highlighting <b>"""
        # This call used to pass re.S as a fourth positional argument.
        # In re.sub that slot is `count`, not `flags`, so DOTALL was never
        # in effect and replacements were silently capped at 16.  The stray
        # argument is dropped: matching stays exactly as it has always
        # been, minus the accidental cap.  DOTALL is deliberately NOT
        # turned on, since it would let `<.*?DELIVERANCE-MATCH=` run from
        # the first `<` in the text across lines and tags.
        result = re.sub(
            r'(?:<[^/][^>]*>)*<.*?DELIVERANCE-MATCH=.*?>(?:</[^>]*>)*',
            lambda match: r'<b style="background-color: #ff8">%s</b>' % match.group(0),
            unicode(highlighted_text))
        return html(result)

    text = template.substitute(
        base_url=url,
        els_in_head=els_in_head, doc=doc,
        elements=all_elements, selector=selector,
        format_tag=format_tag, highlight=highlight,
        wrap_html=wrap_html, mark_deliv_match=mark_deliv_match)
    message = fromstring(
        self._message_template.substitute(message=text, url=url))
    if doc.body.text:
        # Keep the body's leading text after the inserted message
        message.tail = doc.body.text
        doc.body.text = ''
    doc.body.insert(0, message)
    text = tostring(doc)
    return Response(text)
def view_selection(self, req, resp, url):
    """
    View the highlighted selector (from `action_view`)

    Parses ``resp.body`` as HTML, runs the selector given in
    ``req.GET['selector']`` against it, marks every matched element and
    returns the document with a message block (rendered from the
    found / not-found template) inserted at the start of ``<body>``.

    :param req: the request; only ``req.GET['selector']`` is read here.
    :param resp: the response whose ``body`` is the HTML to inspect.
    :param url: URL of the inspected page; its directory is used for the
        injected ``<base href>`` and it is passed on to the templates.
    :returns: a ``Response`` wrapping the serialized, annotated document.
    """
    from deliverance.selector import Selector
    doc = document_fromstring(resp.body)
    # Inject <base href> pointing at the directory of the page URL
    el = Element('base')
    el.set('href', posixpath.dirname(url) + '/')
    doc.head.insert(0, el)
    selector = Selector.parse(req.GET['selector'])
    dummy_type, elements, dummy_attributes = selector(doc)
    if not elements:
        template = self._not_found_template
    else:
        template = self._found_template
    all_elements = []
    els_in_head = False
    for index, el in enumerate(elements):
        el_in_head = self._el_in_head(el)
        if el_in_head:
            els_in_head = True
        # Default anchor is deliverance-selection[-N]; an existing id wins
        anchor = 'deliverance-selection'
        if index:
            anchor += '-%s' % index
        if el.get('id'):
            anchor = el.get('id')
        ## FIXME: is a <a name> better?
        if not el_in_head:
            el.set('id', anchor)
        else:
            # Elements in <head> get no anchor
            anchor = None
        ## FIXME: add :target CSS rule
        ## FIXME: or better, some Javascript
        all_elements.append((anchor, el))
        if not el_in_head:
            # Matched elements outside <head> are outlined via inline style
            style = el.get('style', '')
            if style:
                style += '; '
            style += '/* deliverance */ border: 2px dotted #f00'
            el.set('style', style)
        else:
            # Head elements are tagged with a marker attribute instead;
            # mark_deliv_match() below looks for it in highlighted source
            el.set('DELIVERANCE-MATCH', '1')

    def highlight(html_code):
        """Highlights the given code (for use in the template)"""
        if isinstance(html_code, _Element):
            html_code = tostring(html_code)
        return html(
            pygments_highlight(html_code, HtmlLexer(),
                               HtmlFormatter(noclasses=True)))

    def format_tag(tag):
        """Highlights the lxml HTML tag"""
        return highlight(tostring(tag).split('>')[0] + '>')

    def wrap_html(html, width=100):
        """Breaks lines longer than `width` at tag boundaries"""
        if isinstance(html, _Element):
            html = tostring(html)
        lines = html.splitlines()
        new_lines = []

        def wrap_html_line(line):
            """Returns `line` split into a list of shorter pieces"""
            if len(line) <= width:
                return [line]
            # Leading text followed by a closing tag: split right after it
            match_trail = re.search(r'^[^<]*</.*?>', line, re.S)
            if match_trail:
                result = [match_trail.group(0)]
                result.extend(wrap_html_line(line[match_trail.end():]))
                return result
            # Otherwise peel off the first tag and the last tag and
            # recurse on whatever lies between them
            match1 = re.search(r'^[^<]*<[^>]*>', line, re.S)
            match2 = re.search(r'<[^>]*>[^<>]*$', line, re.S)
            if not match1 or not match2:
                return [line]
            if match2.start() < match1.end():
                # Both patterns matched the same (only) tag.  Splitting
                # here would emit that tag twice with an empty line in
                # between, so the line is left unbroken.
                return [line]
            result = [match1.group(0)]
            result.extend(wrap_html_line(
                line[match1.end():match2.start()]))
            result.append(match2.group(0))
            return result

        for line in lines:
            new_lines.extend(wrap_html_line(line))
        return '\n'.join(new_lines)

    def mark_deliv_match(highlighted_text):
        """Wraps tags carrying DELIVERANCE-MATCH in a highlighting <b>"""
        # This call used to pass re.S as a fourth positional argument.
        # In re.sub that slot is `count`, not `flags`, so DOTALL was never
        # in effect and replacements were silently capped at 16.  The stray
        # argument is dropped: matching stays exactly as it has always
        # been, minus the accidental cap.  DOTALL is deliberately NOT
        # turned on, since it would let `<.*?DELIVERANCE-MATCH=` run from
        # the first `<` in the text across lines and tags.
        result = re.sub(
            r'(?:<[^/][^>]*>)*<.*?DELIVERANCE-MATCH=.*?>(?:</[^>]*>)*',
            lambda match: r'<b style="background-color: #ff8">%s</b>' % match.group(0),
            unicode(highlighted_text))
        return html(result)

    text = template.substitute(
        base_url=url,
        els_in_head=els_in_head, doc=doc,
        elements=all_elements, selector=selector,
        format_tag=format_tag, highlight=highlight,
        wrap_html=wrap_html, mark_deliv_match=mark_deliv_match)
    message = fromstring(
        self._message_template.substitute(message=text, url=url))
    if doc.body.text:
        # Keep the body's leading text after the inserted message
        message.tail = doc.body.text
        doc.body.text = ''
    doc.body.insert(0, message)
    text = tostring(doc)
    return Response(text)
def _inject_content(injection_point, elements):
    """Append the input elements to the template, rewriting part headings."""
    for element in elements:
        classes = element.classes
        # Drop the 'Future Business' summary heading from the start of part 2
        if ('paraChamberSummaryHeading' in classes
                and element.text_content().lower() == 'future business'):
            continue
        if 'DocumentTitle' not in classes:
            injection_point.append(element)
            continue
        # Document title elements are never copied across; the Part 1 and
        # Part 2 titles are replaced with a fixed <h2>, any other is dropped.
        title = element.text_content().lower()
        if 'part 1' in title:
            heading_text = 'Part 1: Business Today'
        elif 'part 2' in title:
            heading_text = 'Part 2: Future Business'
        else:
            continue
        h2 = Element('h2')
        h2.set('class', 'OP-heading-outdent')
        h2.text = heading_text
        injection_point.append(h2)


def _add_permalink_anchors(root, id_prefix):
    """Give every heading a sequential id and a permalink <a> child."""
    # Added at the request of IDMS
    xpath = '//h1|//h2|//h3|//h4|//h5|//h6|//*[@class="paraBusinessItemHeading"]' \
            '|//*[@class="paraBusinessItemHeading-bulleted"]|//*[@class="FbaLocation"]'
    for i, heading in enumerate(root.xpath(xpath)):
        id_text = f'{id_prefix}-{i}'
        # Preserve any pre-existing id in the name attribute before
        # overwriting it with the generated one
        if heading.get('id', default=None):
            heading.set('name', heading.get('id'))
        heading.set('id', id_text)

        anchor = SubElement(heading, 'a')
        permalink_for = 'Permalink for ' + heading.text_content()
        anchor.set('href', '#' + id_text)
        anchor.set('aria-label', 'Anchor')
        anchor.set('title', permalink_for)
        anchor.set('data-anchor-icon', '§')
        anchor.set('class', 'anchor-link')


def _build_fallback_toc(root):
    """Build a static table of contents from the <h2>s inside .js-toc-content."""
    # This will be overridden by tocbot, but we still want a ToC even if
    # JavaScript is disabled.
    nav_xpath_results = root.xpath('//nav[@id="toc"][1]')
    # Same scope as tocbot's contentSelector: '.js-toc-content'
    h2s = root.xpath('//*[contains(@class, "js-toc-content")]//h2')
    if not nav_xpath_results:
        print('no element')
        return
    ol = SubElement(nav_xpath_results[0], 'ol')
    ol.set('class', 'toc-list')
    for h2 in h2s:
        li = SubElement(ol, 'li')
        li.set('class', 'toc-list-item')
        a = SubElement(li, 'a')
        a.set('href', '#' + h2.get('id', ''))
        a.set('class', 'toc-link')
        a.text = h2.text_content()


def _strip_carriage_returns(root):
    """Remove every carriage return from the text and tail of each element."""
    for element in root.iter():
        if element.tail:
            element.tail = element.tail.replace('\r', '')
        if element.text:
            element.text = element.text.replace('\r', '')


def split_and_output(input_root, template_file_name, input_file_name, output_folder=''):
    """Inject the input document's content into the template and write it out.

    The children of the top-level ``<div>``s in the input body are appended
    to the template's ``div#content-goes-here`` (with the part headings
    rewritten), headings are given ids and permalink anchors, a static
    table of contents is built, carriage returns are stripped, and the
    result is written as an HTML file.

    Args:
        input_root: root element of the parsed input HTML.  Elements that
            are appended to the output are the same objects taken from
            this tree, not copies.
        template_file_name: path of the HTML template, parsed with
            ``html.parse``.
        input_file_name: path of the input file; its parent directory is
            the output location when ``output_folder`` is empty.
        output_folder: optional directory to write the output file to.

    Raises:
        IndexError: if the template lacks ``h1#mainTitle``, ``head/title``
            or ``div#content-goes-here``.
    """
    output_tree = html.parse(template_file_name)

    # Element lists keyed by output file label
    file_labels_element_lists = {'op': [], 'new_fb': [], 'an': []}

    # NOTE: splitting Part 2 (future business) into its own 'new_fb' file is
    # currently disabled, so every element goes into the 'op' list.
    file_labels_element_lists['op'].extend(input_root.xpath('//body/div/*'))

    for file_label, element_list in file_labels_element_lists.items():
        if not element_list:
            continue

        # Work on a copy so the parsed template can be reused for each label
        temp_output_tree = deepcopy(output_tree)
        temp_output_root = temp_output_tree.getroot()

        # Only the Order Paper has titles defined.  Setting them inside this
        # branch means any other label keeps the template's own titles; the
        # title variables were previously assigned only for 'op' but used
        # for every label, so another non-empty label would have hit an
        # unbound name or reused the previous label's titles.
        if file_label == 'op':
            title_text = 'Order Paper for ' + DATES.sitting_date_medium
            h1_text = 'Order Paper for ' + DATES.sitting_date_long
            temp_output_root.xpath('//h1[@id="mainTitle"]')[0].text = h1_text
            temp_output_root.xpath('//head/title')[0].text = title_text

        # Position in the template where the input html is injected
        code_injection_point = temp_output_root.xpath(
            '//div[@id="content-goes-here"]')[0]

        _inject_content(code_injection_point, element_list)
        _add_permalink_anchors(temp_output_root, DATES.sitting_date_compact)
        _build_fallback_toc(temp_output_root)
        _strip_carriage_returns(temp_output_root)

        # Output file name is <label><compact date minus first 2 chars><ext>
        outputfile_name = f'{file_label}{DATES.sitting_date_compact[2:]}{fileextension}'
        if output_folder:
            outputfile_path = Path(output_folder).joinpath(outputfile_name)
        else:
            outputfile_path = Path(input_file_name).parent.joinpath(
                outputfile_name)

        temp_output_tree.write(str(outputfile_path),
                               doctype=DOCTYPE,
                               encoding='UTF-8',
                               method="html",
                               xml_declaration=False)
        print(f'{file_label} file is at:\t{outputfile_path}')