示例#1
0
 def parse_html(self, url, css_selector, timeout=10, cache=True):
     """Fetch a page and collect the text of the elements matching a selector.

     The markup is obtained through ``self._http_get``, parsed with
     ``html5lib`` and queried through a cssselect2 ``ElementWrapper``.

     :param url: address handed to ``self._http_get``.
     :param css_selector: CSS selector passed to ``query_all``.
     :param timeout: forwarded unchanged to ``self._http_get``.
     :param cache: forwarded unchanged to ``self._http_get``.
     :return: list holding the ``text`` attribute of each matched element's
         underlying etree element, in the order ``query_all`` yields them.
     """
     markup = self._http_get(url, timeout=timeout, cache=cache)
     page_root = ElementWrapper.from_html_root(html5lib.parse(markup))
     texts = []
     for match in page_root.query_all(css_selector):
         texts.append(match.etree_element.text)
     return texts
示例#2
0
def test_html_select(selector, result):
    """Check that ``selector`` only matches when IDS_ROOT is wrapped as HTML.

    Wrapped as XML, the selector is expected to match nothing; wrapped as
    HTML, the ids of the matched elements must equal ``result``. Elements
    without an ``id`` attribute are reported as ``'nil'``.
    """
    xml_wrapper = ElementWrapper.from_xml_root(IDS_ROOT)
    xml_ids = [
        match.etree_element.get('id', 'nil')
        for match in xml_wrapper.query_all(selector)
    ]
    assert not xml_ids

    html_wrapper = ElementWrapper.from_html_root(IDS_ROOT)
    html_ids = [
        match.etree_element.get('id', 'nil')
        for match in html_wrapper.query_all(selector)
    ]
    assert result == html_ids
示例#3
0
 def select_ids(selector, html_only):
     """Return the ids matched by ``selector`` when ``root`` is read as HTML.

     The same selector is also run against ``root`` wrapped as XML, and the
     two results are cross-checked: with ``html_only`` set the XML query
     must match nothing, otherwise both queries must agree.

     :param selector: CSS selector passed to ``query_all``.
     :param html_only: whether the selector is expected to match only
         under HTML wrapping.
     :return: list of ``id`` values (``'nil'`` when absent) from the HTML
         query.
     """
     def ids_of(wrapper):
         # 'nil' stands in for elements that carry no id attribute.
         return [match.etree_element.get('id', 'nil')
                 for match in wrapper.query_all(selector)]

     from_xml = ids_of(ElementWrapper.from_xml_root(root))
     from_html = ids_of(ElementWrapper.from_html_root(root))
     expected_from_xml = [] if html_only else from_html
     assert from_xml == expected_from_xml
     return from_html
示例#4
0
def parse_course(content: Text) -> Dict[Text, Any]:
    """
    Parse HTML markup of course page.

    Looks up the ``div#Description-subsection-0`` block directly inside
    ``section#Description`` and hands two of its paragraphs to
    ``generate_mapping``: the ``p`` that is the second child (as a wrapped
    node) and the text of the ``p`` that is the first child.

    :param content: HTML markup of the course page
    :return: dict-like collection of data, as built by ``generate_mapping``
    """
    page: ElementWrapper = ElementWrapper.from_html_root(
        html5lib.parse(content))
    description: ElementWrapper = page.query(
        'section#Description'
        '>div#Description-subsection-0')
    # Query the second paragraph first, then the first one, as the data
    # mapping takes the node of the former and only the text of the latter.
    prerequisites: ElementWrapper = description.query('p:nth-child(2)')
    leading_paragraph: ElementWrapper = description.query('p:nth-child(1)')
    return generate_mapping(
        prerequisites, leading_paragraph.etree_element.text)
示例#5
0
def test_lang():
    """Check how ``:lang()`` resolves the language of the root element.

    The cases cover, in order: no language information, a
    ``Content-Language`` ``<meta>`` element with and without the XHTML
    namespace, the ``content_language`` argument on its own, and a ``lang``
    attribute under HTML versus XML wrapping.
    """
    # No language information at all: ':lang(fr)' must not match.
    plain_xhtml = etree.fromstring('''
        <html xmlns="http://www.w3.org/1999/xhtml"></html>
    ''')
    plain_wrapper = ElementWrapper.from_xml_root(plain_xhtml)
    assert not plain_wrapper.matches(':lang(fr)')

    # XHTML-namespaced <meta>: expected to win over content_language='en'.
    xhtml_with_meta = etree.fromstring('''
        <html xmlns="http://www.w3.org/1999/xhtml">
            <meta http-equiv="Content-Language" content=" fr \t"/>
        </html>
    ''')
    meta_wrapper = ElementWrapper.from_xml_root(
        xhtml_with_meta, content_language='en')
    assert meta_wrapper.matches(':lang(fr)')

    # Same <meta> without the namespace: content_language='en' applies.
    unnamespaced_with_meta = etree.fromstring('''
        <html>
            <meta http-equiv="Content-Language" content=" fr \t"/>
        </html>
    ''')
    unnamespaced_wrapper = ElementWrapper.from_xml_root(
        unnamespaced_with_meta, content_language='en')
    assert unnamespaced_wrapper.matches(':lang(en)')

    empty_html = etree.fromstring('<html></html>')

    # A single content_language value is used as the document language.
    single_language = ElementWrapper.from_xml_root(
        empty_html, content_language='en')
    assert single_language.matches(':lang(en)')

    # A comma-separated content_language list is expected not to match.
    language_list = ElementWrapper.from_xml_root(
        empty_html, content_language='en, es')
    assert not language_list.matches(':lang(en)')

    # Without content_language there is nothing to match against.
    no_language = ElementWrapper.from_xml_root(empty_html)
    assert not no_language.matches(':lang(en)')

    # A 'lang' attribute is matched when the document is wrapped as HTML...
    html_lang_attribute = etree.fromstring('<html lang="eN"></html>')
    as_html = ElementWrapper.from_html_root(html_lang_attribute)
    assert as_html.matches(':lang(en)')

    # ...but the same attribute is not matched when wrapped as XML.
    xml_lang_attribute = etree.fromstring('<html lang="eN"></html>')
    as_xml = ElementWrapper.from_xml_root(xml_lang_attribute)
    assert not as_xml.matches(':lang(en)')
示例#6
0
"""

import xml.etree.ElementTree as etree
from pathlib import Path

import pytest
from cssselect2 import ElementWrapper, SelectorError, compile_selector_list

from .w3_selectors import invalid_selectors, valid_selectors

# Directory containing this module; the fixture documents are loaded from it.
CURRENT_FOLDER = Path(__file__).parent

# Parsed 'ids.html' fixture, shared by the id-based selector tests.
IDS_ROOT = etree.parse(CURRENT_FOLDER / 'ids.html')

# The 'id' of every element in IDS_ROOT ('nil' when the attribute is absent),
# in the order returned by the universal selector under HTML wrapping.
ALL_IDS = [
    wrapper.etree_element.get('id', 'nil')
    for wrapper in ElementWrapper.from_html_root(IDS_ROOT).query_all('*')
]

# XHTML <body> element of the 'shakespeare.html' fixture, wrapped as XML.
SHAKESPEARE_BODY = ElementWrapper.from_xml_root(
    etree.parse(CURRENT_FOLDER / 'shakespeare.html').find(
        './/{http://www.w3.org/1999/xhtml}body'))


def get_test_document():
    """Load the ``content.xhtml`` fixture and add namespace-test elements.

    NOTE(review): no ``return`` is visible in the reviewed excerpt; the
    function may continue beyond it, so the return value is not documented
    here.
    """
    document = etree.parse(CURRENT_FOLDER / 'content.xhtml')
    # Element with id="root"; the extra test elements are appended to it.
    parent = document.find(".//*[@id='root']")

    # Setup namespace tests
    # NOTE(review): the loop variable ``id`` shadows the builtin of the same
    # name within this function.
    for id in ('any-namespace', 'no-namespace'):
        # Each new <div> is created in the XHTML namespace and tagged with
        # the id under test.
        div = etree.SubElement(parent, '{http://www.w3.org/1999/xhtml}div')
        div.set('id', id)
示例#7
0

    if len(val) == 1:
        return val[0]
    elif len(val) > 1:
        return val
    else:
        if required:
            raise RequirementMissing(item)
        return None

# NOTE(review): nothing is added to this list in the visible excerpt;
# presumably the scraper loop further down fills it — confirm.
items = []

# Read the entire input document from standard input.
src = sys.stdin.read()
# Parse with html5lib; namespaceHTMLElements=False asks html5lib not to put
# the HTML elements into the XHTML namespace.
t = html5lib.parse(src, namespaceHTMLElements=False)
# Wrap the parsed tree so it can be queried with CSS selectors.
doc = ElementWrapper.from_html_root(t)

for scraper in scrapers:
    item_selector = scraper.get("item")
    if args.verbose: print ("Running scraper {0}".format(item_selector), file=sys.stderr)
    for item_elt in doc.query_all(item_selector):
        if args.verbose: print ("ITEM: {0}".format(tag_open(item_elt.etree_element)), file=sys.stderr)
        item_elt.etree_element.set("itemscope", "itemscope")
        # item_elt = match.etree_element
        try:
            item = {}
            for key, selector in scraper['keys'].items():
                if args.verbose:
                    print ("key, selector: {0}, {1}".format(key, selector), file=sys.stderr)
                #continue
                item[key] = item_selection(item_elt, selector, key, verbose=args.verbose)