def parse_html(self, path):
    '''Parse the document and return its head and main content elements.

    ``path`` must provide a ``read()`` method; whatever it returns is
    handed to ``html_parser.parse`` with ``maybe_xhtml=True``.

    Returns a dict with two keys, each mapped to the first element
    matching its CSS selector:

    * ``'head'``         -- selector ``head``
    * ``'main_content'`` -- selector ``.main-column .section``

    No check is made for missing elements: if a selector matches nothing,
    the ``[0]`` lookup on the result fails (IndexError for a list result).
    '''
    markup = path.read()
    root = etree.adopt_external_document(
        html_parser.parse(markup, maybe_xhtml=True)
    ).getroot()

    # (result key, CSS selector), looked up in this order.
    wanted = (
        ('head', 'head'),
        ('main_content', '.main-column .section'),
    )
    found = {}
    for key, css in wanted:
        found[key] = root.cssselect(css)[0]
    return found
def test_lxml_integration(self):
    '''Check that a parsed capsule can be adopted and queried via lxml.'''
    capsule = html_parser.parse(b'<p id=1>xxx')
    root = etree.adopt_external_document(capsule).getroot()

    # Child iteration and XPath must agree on the <body> element(s).
    via_iteration = list(root.iterchildren('body'))
    via_xpath = list(root.xpath('./body'))
    self.ae(via_iteration, via_xpath)

    paragraph = root.find('body/p')
    self.ae(paragraph.text, 'xxx')

    id_values = root.xpath('//@id')
    self.ae(id_values, ['1'])

    # Test that lxml is not copying the doc internally: change the tree
    # through lxml, then clone the *original* capsule. The clone only
    # serializes identically if the change reached the capsule's document.
    root.set('attr', 'abc')
    cloned_capsule = html_parser.clone_doc(capsule)
    cloned_root = etree.adopt_external_document(cloned_capsule).getroot()
    self.ae(tostring(root), tostring(cloned_root))
def parse_html(self, fh: IO) -> Dict[str, Any]:
    '''Parse the document and return its head and main content elements.

    ``fh`` is read in full and the data is handed to ``html_parser.parse``
    with ``maybe_xhtml=True``.

    Returns a dict with two keys:

    * ``'head'``         -- first element matching the ``head`` selector
    * ``'main_content'`` -- first element matching the first selector, in
      order, that matches anything: ``.main-column .section`` and then
      ``.main__content``

    Raises ValueError when neither main content selector matches. The
    ``head`` lookup is unguarded, so a document without a match for
    ``head`` fails on the ``[0]`` lookup instead.
    '''
    tree = etree.adopt_external_document(
        html_parser.parse(fh.read(), maybe_xhtml=True)
    ).getroot()

    found = {'head': tree.cssselect('head')[0]}

    # Selectors are tried in order of preference; the first hit wins.
    for css in ('.main-column .section', '.main__content'):
        matches = tree.cssselect(css)
        if matches:
            found['main_content'] = matches[0]
            return found

    raise ValueError('No main content element found')