def unpack_element(
    tree: etree._ElementTree,
    element: Union[etree._Element, etree._ElementUnicodeResult, Any],
) -> Tuple[str, str]:
    """Return the path in the tree and the string form of an XPath result item.

    Parameters
    ----------
    tree : etree._ElementTree
        The tree the XPath query was evaluated on, used to compute the path
    element : Union[etree._Element, etree._ElementUnicodeResult, Any]
        A single item of an XPath result: an element, a string result that
        may know its parent element, or any other value

    Returns
    -------
    Tuple[str, str]
        A tuple ``(path, text)``:
        - for an element, its own path and its serialized markup
        - for any other value, the path of its parent element (or ``''`` when
          it has no parent in the tree) and ``str(element)``
    """
    if isinstance(element, etree._Element):
        path = tree.getpath(element)
        text = etree.tostring(element, encoding='unicode', pretty_print=True)
        return (path, text)

    text = str(element)
    try:
        parent = element.getparent()
    except AttributeError:
        # Plain values (numbers, booleans, ordinary strings) have no origin
        # in the tree at all.
        parent = None

    # String results constructed by XPath functions such as string() or
    # concat() do provide getparent(), but it returns None; passing that on to
    # tree.getpath() would raise instead of yielding an empty path.
    path = tree.getpath(parent) if parent is not None else ''
    return (path, text)
def _check_hidden_attributes(
    tree: _ElementTree,
    element: HtmlElement,
    defined_language: str,
) -> Optional[LanguageInfraction]:
    """Look for a language mismatch in the hidden attributes of an element.

    The attributes named in `HIDDEN_ATTRIBUTES` are inspected in order; the
    first one whose detected language differs from the defined language is
    reported and the remaining attributes are not examined.

    Parameters
    ----------
    tree : _ElementTree
        The root element tree of the web page, used to calculate the xpath
    element : HtmlElement
        The element whose hidden attributes are checked
    defined_language : str
        The language defined for this element

    Returns
    -------
    Optional[LanguageInfraction]
        An infraction against WCAG 3.1.2 for the first offending attribute,
        or `None` when no attribute is in violation
    """
    attributes = element.attrib
    for name in HIDDEN_ATTRIBUTES:
        if name not in attributes:
            continue

        value = attributes[name].strip()

        # Values with too few words are not checked for their language.
        if count_words(value) < MIN_WORDS_HIDDEN:
            continue

        predicted = predict_language(value)
        if not predicted:
            # No language could be detected, nothing to compare against.
            continue

        if defined_language != predicted:
            # The attribute's language contradicts the defined one.
            return LanguageInfraction(
                wcag_criterion="WCAG_3_1_2",
                xpath=tree.getpath(element),
                html_language=defined_language,
                predicted_language=predicted,
                text=value,
            )

    return None
def _language_mismatch_infraction(
    tree: _ElementTree,
    element: HtmlElement,
    defined_language: str,
    detected_language: Optional[str],
    text: str,
) -> LanguageInfraction:
    """Build the WCAG 3.1.2 infraction reported at the xpath of `element`."""
    return LanguageInfraction(
        wcag_criterion="WCAG_3_1_2",
        xpath=tree.getpath(element),
        html_language=defined_language,
        predicted_language=detected_language,
        text=text,
    )


def _dfs(
    tree: _ElementTree,
    element: HtmlElement,
    parent_language: str,
    infractions: List[LanguageInfraction],
) -> Tuple[str, Optional[str], Optional[str], List[LanguageInfraction]]:
    """Check for infractions against WCAG 3.1.2 using a recursive DFS.

    Parameters
    ----------
    tree : _ElementTree
        The root element tree of the web page, used to calculate the xpath
    element : HtmlElement
        The current element to check for infractions
    parent_language : str
        The defined language of the current element's parent
    infractions : List[LanguageInfraction]
        The infractions found until now; this list is extended in place and
        the same list object is returned

    Returns
    -------
    Tuple[str, Optional[str], Optional[str], List[LanguageInfraction]]
        A tuple containing:
        - the defined language of the current element (its own `lang`
          attribute or, when absent, the parent's language), lower-cased and
          cut to two characters
        - the detected language of the current element, or `None` when its
          text is too short to run a prediction
        - the text of the current element, extended with the short texts of
          its children
        - the infractions found
    """
    # If the current element does not have a `lang` attribute, take the parent's language
    defined_language = element.get("lang", parent_language).lower()[:2]

    # The text contained in this element (and in this element alone)
    text = (element.text or "").replace("\n", " ").strip()

    # Check for infractions against WCAG 3.1.2 in hidden attributes
    hidden_infraction = _check_hidden_attributes(tree, element, defined_language)
    if hidden_infraction is not None:
        infractions.append(hidden_infraction)

    # Only real elements are visited. Comments and processing instructions
    # have a non-string `tag`; they carry no `lang` attribute to inherit from
    # and their text is not page content, so they are left out of the check.
    # Iterating the element replaces the deprecated `getchildren()`.
    children = [child for child in element if isinstance(child.tag, str)]
    if children:
        # Run this function on each of the children (= recursion)
        children_results = [
            _dfs(tree, child, defined_language, infractions) for child in children
        ]

        language_pairs = {
            (defined, detected) for defined, detected, _, _ in children_results
        }
        if len(language_pairs) == 1:
            # All children have the same language defined and detected
            child_defined_language, child_detected_language = children_results[0][:2]
            if child_detected_language and child_defined_language != child_detected_language:
                # All children are wrong: report the current element once
                # instead of reporting each of its children
                children_text = " ".join(
                    str(child_result[2]) for child_result in children_results)
                infractions.append(
                    _language_mismatch_infraction(
                        tree,
                        element,
                        child_defined_language,
                        child_detected_language,
                        children_text,
                    ))
        else:
            # The children have different values for their defined and detected languages
            for child, child_result in zip(children, children_results):
                child_defined_language, child_detected_language, child_text, _ = child_result
                if child_detected_language and child_detected_language != child_defined_language:
                    # This child is wrong, report only this child
                    infractions.append(
                        _language_mismatch_infraction(
                            tree,
                            child,
                            child_defined_language,
                            child_detected_language,
                            str(child_text),
                        ))

        # Short child texts are too small to be checked on their own, so they
        # are appended to the current element's text instead
        for _, _, child_text, _ in children_results:
            if child_text is None:
                continue
            child_text = child_text.strip()
            # We only add the child text if it is short
            if count_words(child_text) >= MIN_WORDS_DEFAULT:
                continue
            current_text = (text or "").strip()
            # We only add the child text if the current text doesn't end with it
            # (this also skips empty child texts)
            if not current_text.endswith(child_text):
                text = current_text + " " + child_text

    # If the current element's text is long enough, predict its language
    if count_words(text) >= MIN_WORDS_DEFAULT:
        detected_language = predict_language(text)
    else:
        detected_language = None

    return defined_language, detected_language, text, infractions