def runTest(self):
    """Check one CSS selector against the HTML document read from ``doc_fn``.

    The ``(selector, expected_count)`` pair is taken from
    ``self.selectors[self.index]``.  The selector is parsed and translated
    to XPath with the ``regex_prefix='re'`` option, then evaluated on the
    first ``<body>`` element of the document using ``namespaces``.

    Raises:
        AssertionError: if the XPath evaluates to a string instead of
            elements, if an element appears more than once in the results,
            or if the number of results differs from the expected count.
        Exception: any error raised while evaluating the XPath is re-raised
            with the XPath expression appended to its message.
    """
    # The context manager closes the file even when reading fails.
    with open(doc_fn, 'rb') as f:
        c = f.read()
    doc = html.document_fromstring(c)
    body = doc.xpath('//body')[0]
    selector, count = self.selectors[self.index]
    options = dict(regex_prefix='re')
    xpath = cssselect.css_to_xpath(
        cssselect.parse(selector, options), **options)
    try:
        results = body.xpath(xpath, namespaces=namespaces)
    except Exception:
        # sys.exc_info() is kept as the way to reach the active exception;
        # its message is extended with the offending XPath before re-raising.
        e = sys.exc_info()[1]
        e.args = ("%s for xpath %r" % (e, xpath), )
        raise
    # Explicit raises (rather than ``assert 0``) keep these checks active
    # even when Python runs with -O.
    # Reject string results first: iterating a string yields characters, so
    # the duplicate check below would otherwise report a misleading failure.
    if isinstance(results, basestring):
        raise AssertionError(
            "Got string result (%r), not element, for xpath %r"
            % (results[:20], str(xpath)))
    found = set()
    for item in results:
        if item in found:
            raise AssertionError(
                "Element shows up multiple times: %r" % item)
        found.add(item)
    if len(results) != count:
        raise AssertionError(
            "Did not get expected results (%s) instead %s for xpath %r"
            % (count, len(results), str(xpath)))
def runTest(self):
    """Check one CSS selector against the HTML document read from ``doc_fn``.

    The ``(selector, expected_count)`` pair is taken from
    ``self.selectors[self.index]``.  The selector is parsed, translated to
    XPath and evaluated on the first ``<body>`` element of the document.

    Raises:
        AssertionError: if the XPath evaluates to a string instead of
            elements, if an element appears more than once in the results,
            or if the number of results differs from the expected count.
        Exception: any error raised while evaluating the XPath is re-raised
            with the XPath expression appended to its message.
    """
    # The context manager closes the file even when reading fails.
    with open(doc_fn, "rb") as f:
        c = f.read()
    doc = html.document_fromstring(c)
    body = doc.xpath("//body")[0]
    selector, count = self.selectors[self.index]
    xpath = cssselect.css_to_xpath(cssselect.parse(selector))
    try:
        results = body.xpath(xpath)
    except Exception:
        e = sys.exc_info()[1]
        # ``args`` must be a tuple.  Assigning a bare string would be
        # converted into a tuple of single characters and garble the
        # message of the re-raised exception.
        e.args = ("%s for xpath %r" % (e, xpath),)
        raise
    # Explicit raises (rather than ``assert 0``) keep these checks active
    # even when Python runs with -O.
    # Reject string results first: iterating a string yields characters, so
    # the duplicate check below would otherwise report a misleading failure.
    if isinstance(results, basestring):
        raise AssertionError(
            "Got string result (%r), not element, for xpath %r"
            % (results[:20], str(xpath))
        )
    found = set()
    for item in results:
        if item in found:
            raise AssertionError("Element shows up multiple times: %r" % item)
        found.add(item)
    if len(results) != count:
        raise AssertionError(
            "Did not get expected results (%s) instead %s for xpath %r"
            % (count, len(results), str(xpath))
        )
def match_selectors_against_html_root_element(selectors, html_element):
    """
    Collect the CSS selectors that match something in the given HTML tree.

    @param selectors set of CSS selectors (strings)
    @param html_element lxml.etree.Element object
    @return set of found selectors
    """
    matched = set()
    translator = CssDeadwoodHtmlTranslator()
    for selector_str in selectors:
        try:
            # The selector string is parsed into cssselect Selector objects
            # (parse() always returns a list, hence the loop) so that the
            # pseudo element can be cleared before translating each one
            # with selector_to_xpath(); css_to_xpath() on the raw string
            # would not let us ignore pseudo elements.
            for parsed in cssselect.parse(selector_str):
                parsed.pseudo_element = None
                xpath_expr = translator.selector_to_xpath(parsed)
                hits = html_element.xpath(xpath_expr)
                if len(hits) > 0:
                    matched.add(selector_str)
        except Exception:
            # Best effort: a failing selector is logged through the
            # module-level ``_log`` and the remaining selectors are
            # still processed.
            _log.exception(
                'lxml css select failed on selector %r' % selector_str)
    return matched
def selector_to_xpath(selector):
    """Return ``pseudo_type, selector_callable`` from a cssutils ``selector``.

    ``pseudo_type`` is a string (or ``None`` when no known pseudo-element
    was found) and ``selector_callable`` is a :class:`lxml.cssselect`
    XPath callable.  The resulting pair is cached on ``selector`` itself and
    returned directly on later calls.

    """
    try:
        return selector._x_weasyprint_parsed_cssselect
    except AttributeError:
        parsed = cssselect.parse(selector.selectorText)
        # cssutils guarantees that `selector` is a single selector, not a
        # group (`rule.selectorList` is the group), so `parsed` is never a
        # `cssselect.Or`.  Three situations remain:
        # - A trailing pseudo-element, the only position CSS allows.
        #   Parsing is left-to-right, so it shows up as a `cssselect.Pseudo`
        #   wrapper (possibly as the sub-selector of a combined selector)
        #   that can be unwrapped.
        # - A pseudo-element elsewhere.  This is invalid and the whole
        #   ruleset should be ignored; cssselect.CSSSelector() will raise
        #   a cssselect.ExpressionError.
        # - No pseudo-element: `cssselect.CSSSelector` handles it as is.
        is_combined = isinstance(parsed, cssselect.CombinedSelector)
        candidate = parsed.subselector if is_combined else parsed

        # Stays None for "no pseudo-element" and for invalid selectors.
        pseudo_type = None
        if isinstance(candidate, cssselect.Pseudo) \
                and candidate.ident in PSEUDO_ELEMENTS:
            pseudo_type = str(candidate.ident)
            # Strip the pseudo-element off the selector.
            if is_combined:
                parsed.subselector = candidate.element
            else:
                parsed = candidate.element

        result = (pseudo_type, cssselect.CSSSelector(parsed))
        # Remember the result for the next use of the same stylesheet.
        selector._x_weasyprint_parsed_cssselect = result
        return result