def parse_html(self, url, css_selector, timeout=10, cache=True):
    # Fetch the page, parse it with html5lib, and return the text of every
    # element matching the given CSS selector (via cssselect2.ElementWrapper).
    html = self._http_get(url, timeout=timeout, cache=cache)
    document = html5lib.parse(html)
    results = ElementWrapper.from_html_root(document).query_all(css_selector)
    data = [result.etree_element.text for result in results]
    return data
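# A minimal, self-contained sketch of the same pattern without the HTTP layer.
# The helper name `_extract_text` and the sample markup are assumptions for
# illustration only; they are not part of the original module.
import html5lib
from cssselect2 import ElementWrapper


def _extract_text(html, css_selector):
    # Parse the markup and collect the text of every element matching the
    # selector, starting from the document root.
    document = html5lib.parse(html)
    wrapper = ElementWrapper.from_html_root(document)
    return [match.etree_element.text
            for match in wrapper.query_all(css_selector)]


# Example:
#     _extract_text('<ul><li class="x">a</li><li class="x">b</li></ul>', 'li.x')
# would return ['a', 'b'].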
def test_html_select(selector, result):
    assert not [
        element.etree_element.get('id', 'nil')
        for element
        in ElementWrapper.from_xml_root(IDS_ROOT).query_all(selector)
    ]
    assert result == [
        element.etree_element.get('id', 'nil')
        for element
        in ElementWrapper.from_html_root(IDS_ROOT).query_all(selector)
    ]
def select_ids(selector, html_only):
    xml_ids = [
        element.etree_element.get('id', 'nil')
        for element in ElementWrapper.from_xml_root(root).query_all(selector)]
    html_ids = [
        element.etree_element.get('id', 'nil')
        for element in ElementWrapper.from_html_root(root).query_all(selector)]
    if html_only:
        assert xml_ids == []
    else:
        assert xml_ids == html_ids
    return html_ids
def parse_course(content: Text) -> Dict[Text, Any]:
    """
    Parse HTML markup of a course page.

    :param content: raw HTML markup of the course page
    :return: dict-like collection of data
    """
    root: ElementWrapper = ElementWrapper.from_html_root(
        html5lib.parse(content))
    description_node = root.query(
        'section#Description>div#Description-subsection-0')
    selector_second_p = 'p:nth-child(2)'
    selector_first_p = 'p:nth-child(1)'
    prerequisites_node: ElementWrapper = description_node.query(
        selector_second_p)
    node: ElementWrapper = description_node.query(selector_first_p)
    text = node.etree_element.text
    data = generate_mapping(prerequisites_node, text)
    return data
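# Hypothetical usage sketch: `generate_mapping` is defined elsewhere in the
# original project, and the markup below is an assumed example that only
# illustrates the structure the selectors above expect.
#
#     markup = '''
#         <section id="Description">
#           <div id="Description-subsection-0">
#             <p>Course summary paragraph.</p>
#             <p>Prerequisites paragraph.</p>
#           </div>
#         </section>
#     '''
#     data = parse_course(markup)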
def test_lang():
    doc = etree.fromstring('''
        <html xmlns="http://www.w3.org/1999/xhtml"></html>
    ''')
    assert not ElementWrapper.from_xml_root(doc).matches(':lang(fr)')

    doc = etree.fromstring('''
        <html xmlns="http://www.w3.org/1999/xhtml">
            <meta http-equiv="Content-Language" content=" fr \t"/>
        </html>
    ''')
    root = ElementWrapper.from_xml_root(doc, content_language='en')
    assert root.matches(':lang(fr)')

    doc = etree.fromstring('''
        <html>
            <meta http-equiv="Content-Language" content=" fr \t"/>
        </html>
    ''')
    root = ElementWrapper.from_xml_root(doc, content_language='en')
    assert root.matches(':lang(en)')

    doc = etree.fromstring('<html></html>')
    root = ElementWrapper.from_xml_root(doc, content_language='en')
    assert root.matches(':lang(en)')

    root = ElementWrapper.from_xml_root(doc, content_language='en, es')
    assert not root.matches(':lang(en)')

    root = ElementWrapper.from_xml_root(doc)
    assert not root.matches(':lang(en)')

    doc = etree.fromstring('<html lang="eN"></html>')
    root = ElementWrapper.from_html_root(doc)
    assert root.matches(':lang(en)')

    doc = etree.fromstring('<html lang="eN"></html>')
    root = ElementWrapper.from_xml_root(doc)
    assert not root.matches(':lang(en)')
""" import xml.etree.ElementTree as etree from pathlib import Path import pytest from cssselect2 import ElementWrapper, SelectorError, compile_selector_list from .w3_selectors import invalid_selectors, valid_selectors CURRENT_FOLDER = Path(__file__).parent IDS_ROOT = etree.parse(CURRENT_FOLDER / 'ids.html') ALL_IDS = [ element.etree_element.get('id', 'nil') for element in ElementWrapper.from_html_root(IDS_ROOT).query_all('*') ] SHAKESPEARE_BODY = (ElementWrapper.from_xml_root( etree.parse( CURRENT_FOLDER / 'shakespeare.html').find('.//{http://www.w3.org/1999/xhtml}body'))) def get_test_document(): document = etree.parse(CURRENT_FOLDER / 'content.xhtml') parent = document.find(".//*[@id='root']") # Setup namespace tests for id in ('any-namespace', 'no-namespace'): div = etree.SubElement(parent, '{http://www.w3.org/1999/xhtml}div') div.set('id', id)
    # Tail of a selection helper (the lines above this fragment are not
    # shown): `val` holds the values matched for one key.
    if len(val) == 1:
        return val[0]
    elif len(val) > 1:
        return val
    else:
        if required:
            raise RequirementMissing(item)
        return None


items = []
src = sys.stdin.read()
t = html5lib.parse(src, namespaceHTMLElements=False)
doc = ElementWrapper.from_html_root(t)

for scraper in scrapers:
    item_selector = scraper.get("item")
    if args.verbose:
        print("Running scraper {0}".format(item_selector), file=sys.stderr)
    for item_elt in doc.query_all(item_selector):
        if args.verbose:
            print("ITEM: {0}".format(tag_open(item_elt.etree_element)),
                  file=sys.stderr)
        item_elt.etree_element.set("itemscope", "itemscope")
        try:
            item = {}
            for key, selector in scraper['keys'].items():
                if args.verbose:
                    print("key, selector: {0}, {1}".format(key, selector),
                          file=sys.stderr)
                item[key] = item_selection(item_elt, selector, key,
                                           verbose=args.verbose)
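# Hypothetical shape of the `scrapers` configuration the loop above consumes
# (assumed for illustration; the real configuration is loaded elsewhere):
# each entry names an "item" CSS selector plus a "keys" mapping of output
# field names to selectors evaluated against each matched item element.
#
#     scrapers = [
#         {
#             "item": "div.product",
#             "keys": {
#                 "name": "h2.title",
#                 "price": "span.price",
#             },
#         },
#     ]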