def parse(self, markdown):
    """Extract titled sections from a markdown document.

    The markdown is rendered to HTML with ``markdown2``, wrapped in a
    single ``<div>`` so that it has one root, and parsed with
    ``lxml.etree``.  Every entry of ``self.schema_dict`` whose ``'type'``
    is ``'titlearea'`` or ``'list'`` is then looked up through its CSS
    ``'selector'``.

    Returns a dict keyed by each entry's ``'name'``.  Each value is
    ``{'title': ..., 'description': ...}`` where the title is the text of
    the first element matching the selector and the description is the
    text of the first ``<p>`` following a match (``None`` when there is
    no such paragraph).  Entries whose selector matches nothing are left
    out of the result instead of raising ``IndexError``.
    """
    html = markdown2.markdown(markdown).rstrip()
    # A markdown rendering is a sequence of sibling elements; the wrapper
    # gives the XML parser the single root element it requires.
    document = lxml.etree.fromstring('<div>' + html + '</div>')
    translator = cssselect.GenericTranslator()

    res = {}
    for schema_elem in self.schema_dict:
        # Both supported types are extracted in exactly the same way.
        if schema_elem['type'] not in ('titlearea', 'list'):
            continue

        elem_xpath = translator.css_to_xpath(schema_elem['selector'])
        found_elems = document.xpath(elem_xpath)
        if not found_elems:
            # Nothing in the document matches this schema entry.
            continue

        found_texts = document.xpath(elem_xpath + '/following::p[1]')
        description = found_texts[0].text if found_texts else None
        res[schema_elem['name']] = {
            'title': found_elems[0].text,
            'description': description,
        }
    return res
def parse_availability(html, service_group):
    """Collect availability figures for one service group from a report page.

    The page is searched for a ``.dataTitle`` element whose text contains
    both ``service_group`` and ``'Service State Breakdowns'``; the
    ``table.data`` element at the paired index is then read row by row
    (``tr.dataOdd`` and ``tr.dataEven`` rows only).

    Returns a dict with two keys: ``"services"`` maps each service name to
    whatever ``parse_service_availability`` returns for its row, and
    ``"average"`` is built from the row whose first cell carries a
    ``colspan`` attribute.  Both are empty when no title matches.
    """
    translator = cssselect.GenericTranslator()
    root = lxml.etree.HTML(html)

    def leading_token(cell):
        # Keep only what precedes the first space of the cell text.
        return cell.text.split(' ')[0]

    breakdown = None
    # Counting starts at -1, so the title at position n is paired with the
    # table at index n - 1.
    titles = root.xpath(translator.css_to_xpath('.dataTitle'))
    for index, heading in enumerate(titles, -1):
        label = heading.text
        if service_group in label and 'Service State Breakdowns' in label:
            tables = root.xpath(translator.css_to_xpath('table.data'))
            breakdown = tables[index]
            break

    services = {}
    average = {}
    if breakdown is None:
        return {"services": services, "average": average}

    row_xpath = translator.css_to_xpath("tr.dataOdd, tr.dataEven")
    for row in breakdown.xpath(row_xpath):
        cells = row.getchildren()
        if 'colspan' not in cells[0].attrib:
            entry = parse_service_availability(row)
            services[entry['name']] = entry
            continue
        # Summary row: exactly six cells are expected.
        _title, ok, warn, unknown, crit, _undetermined = cells
        average = {
            "name": "Average",
            "ok": leading_token(ok),
            "warning": leading_token(warn),
            "unknown": leading_token(unknown),
            "critical": leading_token(crit),
        }

    return {"services": services, "average": average}
def parse_status(html, service_group):
    """Extract per-host service status for one service group.

    The page is searched for a ``.statusTitle`` element whose text
    contains both ``service_group`` and ``'Service Status Details'``; the
    ``table.status`` element at the paired index is then read row by row.

    A row whose first cell holds exactly one link starts a new host
    (built by ``parse_hostlink``); the remaining cells of every data row
    are turned into a service by ``parse_service`` and appended to the
    ``'services'`` list of the most recently seen host.

    Returns ``{"hosts": {hostname: host, ...}}``; the mapping is empty
    when no title matches.
    """
    tr = cssselect.GenericTranslator()
    h = lxml.etree.HTML(html)

    table = None
    # enumerate starts at -1, so the title at position n selects the table
    # at index n - 1.
    for i, e in enumerate(h.xpath(tr.css_to_xpath('.statusTitle')), -1):
        if service_group not in e.text:
            continue
        if 'Service Status Details' not in e.text:
            continue
        table = h.xpath(tr.css_to_xpath('table.status'))[i]
        break

    hosts = {}
    if table is not None:
        link_xpath = tr.css_to_xpath('a')
        # Rows without a host link belong to the host of an earlier row.
        current_host = None
        for row in table.getchildren():
            children = row.getchildren()
            # Skip empty rows
            if len(children) <= 1:
                continue
            # Skip header row
            if children[0].tag == 'th':
                continue

            hostname = children[0].xpath(link_xpath)
            if len(hostname) > 1:
                LOG.warning("Too many links found.")
            elif len(hostname) == 1:
                current_host = parse_hostlink(hostname[0])
                hosts[current_host['hostname']] = current_host

            if current_host is None:
                # No host has been identified yet, so there is nothing
                # to attach this service to.
                LOG.warning("Service row found before any host row.")
                continue

            service = parse_service(children[1:])
            current_host['services'].append(service)

    context = {"hosts": hosts}
    return context
def translate_css_to_xpath(*sheets):
    """Translate the given stylesheet files, in order, into one flat list.

    Each path in ``sheets`` is parsed with ``cssutils.parseFile`` and
    handed to ``_translate_stylesheet`` together with a shared
    ``GenericTranslator``; the per-sheet results are concatenated.
    """
    translator = cssselect.GenericTranslator()
    # A list (not a mapping) keeps the entries in sheet order, so that
    # later sheets can "cascade" over earlier ones.
    return [
        translated
        for path in sheets
        for translated in _translate_stylesheet(
            cssutils.parseFile(path), translator)
    ]
def parse_availability(html, service_group):
    """Read the availability breakdown of one service group.

    Looks for the ``.dataTitle`` element mentioning both ``service_group``
    and ``'Service State Breakdowns'`` and reads the ``tr.dataOdd`` /
    ``tr.dataEven`` rows of the ``table.data`` paired with it.

    Returns ``{'services': ..., 'average': ...}``.  ``services`` maps a
    service name to the result of ``parse_service_availability`` for its
    row.  ``average`` comes from the row whose first cell has a
    ``colspan`` attribute: its ``'ok'`` value is the sum of
    ``parse_percent_string`` applied to the ok and warning cells, and its
    ``'critical'`` value is ``parse_percent_string`` of the critical cell.
    Both entries are empty when no title matches.
    """
    translator = cssselect.GenericTranslator()
    page = lxml.etree.HTML(html)
    wanted = (service_group, 'Service State Breakdowns')

    target = None
    # The counter starts at -1: title number n goes with table number n - 1.
    for position, title_elem in enumerate(
            page.xpath(translator.css_to_xpath('.dataTitle')), -1):
        if all(fragment in title_elem.text for fragment in wanted):
            target = page.xpath(translator.css_to_xpath('table.data'))[position]
            break

    services = {}
    average = {}
    rows = []
    if target is not None:
        rows = target.xpath(translator.css_to_xpath("tr.dataOdd, tr.dataEven"))

    for row in rows:
        columns = row.getchildren()
        is_summary = 'colspan' in columns[0].attrib
        if is_summary:
            # The summary row must have exactly six cells.
            _label, ok, warn, _unknown, crit, _undet = columns
            ok_value = parse_percent_string(ok.text)
            ok_value = ok_value + parse_percent_string(warn.text)
            average = {
                'name': 'Average',
                'ok': ok_value,
                'critical': parse_percent_string(crit.text),
            }
        else:
            record = parse_service_availability(row)
            services[record['name']] = record

    return {'services': services, 'average': average}
def selector_to_xpath(cls, selector, xmlns=None):
    """Convert a CSS selector into an XPath expression.

    ``selector`` is split on single spaces.  Any piece that starts with
    ``.`` or ``#`` gets an explicit universal element (``*``) in front so
    that a namespace prefix can be attached to it.

    ``xmlns`` is an optional single-item dict mapping a namespace prefix
    to its href.  When given, every piece except the combinators ``>``,
    ``+`` and ``~`` is qualified as ``prefix|piece``; only the prefix is
    used here.

    The XPath produced by cssselect is then shortened:
    ``descendant-or-self::`` is dropped and both ``/descendant::`` and
    ``/*/`` become ``//``.
    """
    # Only a leading '.' or '#' needs the explicit '*'.  Rewriting every
    # '#' would turn 'div#main' into the malformed 'div*#main'.
    selector = ' '.join(
        '*' + piece if piece.startswith(('.', '#')) else piece
        for piece in selector.split(' ')
    )
    log.debug(selector)

    if xmlns is not None:
        prefix = list(xmlns)[0]
        combinators = ('>', '+', '~')
        qualified = []
        for piece in selector.split(' '):
            piece = piece.strip()
            if not piece:
                # Repeated spaces yield empty pieces; prefixing them
                # would produce a dangling 'prefix|'.
                continue
            if piece in combinators:
                qualified.append(piece)
            else:
                qualified.append(prefix + '|' + piece)
        selector = ' '.join(qualified)
        log.debug(selector)

    path = cssselect.GenericTranslator().css_to_xpath(selector)
    path = path.replace("descendant-or-self::", "")
    path = path.replace("/descendant::", "//")
    path = path.replace('/*/', '//')
    log.debug(' ==> %s', path)
    return path
class By(object):
    """Supported locator strategies, plus their translation to XPath."""

    ID = "id"
    XPATH = "xpath"
    NAME = "name"
    TAG_NAME = "tag name"
    CLASS_NAME = "class name"
    CSS_SELECTOR = "css selector"

    # One translator instance shared by every conversion.
    translator = cssselect.GenericTranslator()

    @classmethod
    def convert_selector_to_xpath(cls, by=ID, value=None):
        """Return ``value`` expressed as XPath for the strategy ``by``.

        ID, tag-name, class-name and name locators are first rewritten as
        CSS selectors; CSS selectors are then translated to XPath.  Any
        other strategy (XPATH included) returns ``value`` untouched.
        """
        # (strategy, CSS template); a None template means the value is
        # already a usable CSS selector.
        css_rewrites = (
            (cls.ID, '[id="%s"]'),
            (cls.TAG_NAME, None),
            (cls.CLASS_NAME, ".%s"),
            (cls.NAME, '[name="%s"]'),
        )
        for strategy, template in css_rewrites:
            if by == strategy:
                if template is not None:
                    value = template % value
                by = cls.CSS_SELECTOR
                break

        if by == cls.CSS_SELECTOR:
            return cls.translator.css_to_xpath(value)
        return value
def match_selector(rule, tree):
    """Generate ``(element, specificity)`` pairs for ``rule`` within ``tree``.

    The rule's selector text is parsed into individual selectors; those
    that target a pseudo-element are skipped.  Each remaining selector is
    translated to XPath and evaluated against ``tree``, and every hit is
    yielded together with that selector's specificity.
    """
    parsed_selectors = cssselect.parse(rule.selector.as_css())
    to_xpath = cssselect.GenericTranslator().selector_to_xpath
    for parsed in parsed_selectors:
        if parsed.pseudo_element:
            continue
        weight = parsed.specificity()
        for node in tree.xpath(to_xpath(parsed)):
            yield node, weight
def get_cinemas_raw_data(self):
    """Return every element of the site's home page that has ``data-lat``.

    The page at https://www.eventcinemas.com.au/ is fetched through
    ``cache.get_url`` and parsed; the result is the list of elements
    matching the CSS selector ``[data-lat]``.
    """
    response = cache.get_url("https://www.eventcinemas.com.au/")
    page = html.fromstring(response.content)
    has_latitude = css.GenericTranslator().css_to_xpath("[data-lat]")
    return page.xpath(has_latitude)
def parse_service(service_columns):
    """Build a service record from the cells of one status row.

    The name is the text of the first link in the first cell; the display
    name is looked up in ``SERVICE_NAMES`` (``None`` when unknown).  The
    next three cells supply status, last-checked and duration text.
    """
    link_xpath = cssselect.GenericTranslator().css_to_xpath('a')
    first_link = service_columns[0].xpath(link_xpath)[0]
    name = first_link.text

    record = {'name': name, 'display_name': SERVICE_NAMES.get(name)}
    record['status'] = service_columns[1].text
    record['last_checked'] = service_columns[2].text
    record['duration'] = service_columns[3].text
    return record
def services_in_maintenance():
    """List the services that currently have scheduled downtime.

    Requests ``extinfo.cgi?type=6`` below ``env.nagios`` (authenticated
    with ``env.nagios_auth``) and reads the second ``table.downtime``
    element of the response.  Header rows and the "no services" notice
    row are skipped.

    Returns a list of dicts with the keys ``host``, ``service``,
    ``reason`` and ``id``; the list is empty when the page has fewer than
    two such tables.
    """
    response = requests.get(env.nagios + "extinfo.cgi?type=6",
                            auth=env.nagios_auth)
    page = lxml.etree.HTML(response.text)
    translator = cssselect.GenericTranslator()
    downtime_tables = page.xpath(translator.css_to_xpath("table.downtime"))

    if len(downtime_tables) < 2:
        return []

    services = []
    for row in downtime_tables[1]:
        cells = row.getchildren()
        first_cell = cells[0]
        if first_cell.tag == "th":
            continue
        if first_cell.text == 'There are no services with scheduled downtime':
            continue
        # Host and service names are the text of the first child element
        # of their cells.
        entry = {
            "host": first_cell.getchildren()[0].text,
            "service": cells[1].getchildren()[0].text,
            "reason": cells[4].text,
            "id": cells[9].text,
        }
        services.append(entry)
    return services
def check_css(self, myfile):
    """Find unused CSS selectors and classes that no CSS rule covers.

    Reads the first ``<style>`` element found under ``<head>`` in
    ``myfile.tree`` and parses it with tinycss.  Returns ``None`` without
    touching any attribute when there is no ``<head>`` or no ``<style>``.
    Otherwise the following attributes are set on ``self``:

    * ``cssutils_errors`` -- ``"line,column: reason"`` strings for the
      parse errors, with the line shifted by the source line of the
      ``<style>`` element.
    * ``sel_unchecked`` -- selectors cssselect could not translate.
    * ``sel_unused`` -- selectors that match nothing in the tree.
    * ``classes_undefined`` -- ``[sourceline, class_name]`` pairs for
      classes used on an element that no matching selector ends with.

    Side effect: every element matched by a selector ending in a class
    gets that class appended to a ``__used_classes`` attribute, which is
    left on the tree.
    """
    # Fails on a few corner cases, such as
    #   ".tdm > tbody > tr > td:first-child + td"
    #   ".onetable td"
    #
    # Rules nested directly inside @media are included; other at-rules
    # are ignored.

    # Find the CSS style
    head = myfile.tree.find('head')
    css = head.find('style') if head is not None else None
    if css is None:
        return

    # The CSS can be in a comment or not
    if len(css):
        # Not sure whether that covers all the comment cases. Maybe add
        # all the children
        css_text = etree.tostring(css[0]).decode("utf8")
    else:
        # An empty <style/> has no text at all.
        css_text = css.text or ''

    stylesheet = tinycss.make_parser().parse_stylesheet(css_text)

    # Retrieve the errors
    self.cssutils_errors = [
        "{0},{1}: {2}".format(err.line + css.sourceline - 1,
                              err.column,
                              err.reason)
        for err in stylesheet.errors
    ]

    css_selectors = []
    for rule in stylesheet.rules:
        if rule.at_keyword is None:
            # Regular rule.
            rulesets = [rule]
        elif rule.at_keyword == '@media':
            # Itself a bunch of regular rules.
            rulesets = [inner for inner in rule.rules
                        if inner.at_keyword is None]
        else:
            continue
        for ruleset in rulesets:
            # Strip each piece: ".foo , .bar" would otherwise yield
            # ".foo ", which the class regex below cannot match.
            css_selectors.extend(
                piece.strip()
                for piece in ruleset.selector.as_css().split(','))

    # Drop duplicates, keeping the first occurrence of each selector.
    css_selectors = list(dict.fromkeys(css_selectors))

    # Find the unused/undefined CSS. It is possible 2 rules will
    # match the same class (for instance "p.foo" and ".foo" will
    # match "class=foo"). That is not detected, and both rules
    # will be valid.
    self.sel_unchecked = []
    self.sel_unused = []
    translator = cssselect.GenericTranslator()

    for selector in css_selectors:
        # Get the selector (eg. "body", "p", ".some_class")
        try:
            sel_xpath = translator.css_to_xpath(selector)
        except (cssselect.xpath.ExpressionError,
                cssselect.parser.SelectorSyntaxError):
            self.sel_unchecked.append(selector)
            continue

        # Retrieve where it is used in the xhtml
        occurences = etree.XPath(sel_xpath)(myfile.tree)
        if len(occurences) == 0:
            self.sel_unused.append(selector)
            continue

        # If it's from a class, find the name. It should be the
        # last word starting with a dot (eg. "p.foo", ".foo",
        # "#toc .foo" => "foo")
        #
        # Remove a pseudo selector with rsplit, if there is one.
        m = re.search(r'\.([\w-]+)$', selector.rsplit(":", 1)[0])
        if m is None:
            continue
        cl = m.group(1)

        # Mark the class wherever it is used, in each element
        for item in occurences:
            item.attrib['__used_classes'] = item.attrib.get(
                '__used_classes', '') + ' ' + cl

    # Look for unused classes
    self.classes_undefined = []
    find = etree.XPath("//*[@class]")
    for element in find(myfile.tree):
        classes = set(element.attrib['class'].split())
        used_classes = element.attrib.get('__used_classes', None)
        if used_classes:
            # Subtract the content of used_classes from classes,
            # leaving the classes that were not matched.
            classes -= set(used_classes.split())

        # Finally, create the warnings (sorted for a stable order)
        for cl in sorted(classes):
            self.classes_undefined.append([element.sourceline, cl])
def _css(self, css_selector):
    """Run a CSS query by translating it to XPath and delegating to ``_xpath``."""
    translator = cssselect.GenericTranslator()
    as_xpath = translator.css_to_xpath(css_selector)
    return self._xpath(as_xpath)