def _extract_common_features(self, node):
    """Compute the per-node feature counters shared by all node kinds.

    Dispatches on ``node.tag``:

    * ``a``      -- link features are filled in by
      ``_extract_link_features`` when the ``valid_link_classifier``
      accepts the node; otherwise the node is hidden and skipped.
    * ``img``    -- counted as one image; no text is measured.
    * ``style``  -- optionally moved via ``_move_internal_styles`` (config
      switch ``move_internal_styles``), then skipped.
    * ``script`` -- optionally dropped from the tree (config switch
      ``drop_scripts``), then skipped.
    * any tag listed in ``self._config["skipped_tags"]`` -- skipped.

    Every other node (and accepted links) gets ``text_length`` computed
    from its own ``text`` plus its ``tail``.

    Returns:
        A ``(keep, features)`` tuple.  ``keep`` is ``False`` and
        ``features`` is ``None`` when the node is skipped; otherwise
        ``keep`` is ``True`` and ``features`` is the counter dict.
    """
    features = {
        "link_length": 0,
        "link_length_bak": 0,
        "link_count": 0,
        "image_link_count": 0,
        "short_link_count": 0,
        "text_length": 0,
        "large_text_count": 0,
        "image_count": 0,
    }
    switches = self._config["operation_switches"]
    tag = node.tag

    if tag == "a":
        if not self._classifiers["valid_link_classifier"].classify(node):
            self._hide_node(node)
            return False, None
        # Accepted links fall through so their text is measured as well.
        self._extract_link_features(node, features)
    elif tag == "img":
        features["image_count"] = 1
        return True, features
    elif tag == "style":
        # Move internal styles in <body> to <head>.
        if switches["move_internal_styles"]:
            self._move_internal_styles(node)
        return False, None
    elif tag == "script":
        if switches["drop_scripts"]:
            node.drop_tree()
        return False, None
    elif tag in self._config["skipped_tags"]:
        return False, None

    # Sum the node's own text and its tail.  The previous one-line form
    # chained two conditional expressions without parentheses, so the tail
    # was only counted when ``node.text`` was None.
    text_length = 0
    for fragment in (node.text, node.tail):
        if fragment is not None:
            text_length += label_count(remove_space(fragment.strip()))
    features["text_length"] = text_length

    if text_length >= self._config["large_text_threshold"]:
        features["large_text_count"] = 1
    return True, features
def is_empty_node(cls, node, default_empty_tags, invisible_tags):
    """Check whether a node is empty.

    A node is considered empty when its text content is blank after
    ``remove_space`` and it has no child whose tag falls outside
    ``invisible_tags``.

    Args:
        node: Element to inspect; must provide ``tag``, ``text_content()``
            and ``getchildren()``.
        default_empty_tags: Tags that are exempt from the check; a node
            with one of these tags is never reported as empty.
        invisible_tags: Child tags that do not count as content.

    Returns:
        ``True`` if the node is empty, ``False`` otherwise.
    """
    if node.tag in default_empty_tags:
        return False

    has_no_text = len(remove_space(node.text_content())) == 0
    # Compare each child's *tag* with the invisible tag names.  The previous
    # code compared the child element itself, which never matched, and took
    # len() of a filter object, which raises TypeError on Python 3.
    has_no_visible_children = all(
        child.tag in invisible_tags for child in node.getchildren()
    )
    return has_no_visible_children and has_no_text