def filter_content(self, content: etree._Element):
    """Return the elements selected from ``content`` that pass this filter.

    Candidates are picked with a CSS selector built by comma-joining
    ``self.element_tags``. A candidate whose ``text`` is ``None`` is always
    skipped. Any other candidate is kept when the first applicable rule,
    checked in this order, holds:

    1. it carries a ``censored_text`` attribute;
    2. its text contains at least one entry of ``self.element_strings``;
    3. ``self.check_element_by_rules`` is truthy and
       ``self.replacer.replace_text(text)`` returns more than one item.

    :param content: element queried through its ``cssselect`` method.
    :returns: list of the kept elements, in the order ``cssselect``
        returned them.
    """
    candidates: List[etree._Element] = content.cssselect(
        ','.join(self.element_tags))
    selected: List[etree._Element] = []
    for element in candidates:
        text = element.text
        if text is None:
            continue
        if element.get('censored_text') is not None:
            selected.append(element)
        # The attribute is known to be absent here, so only the text itself
        # needs scanning; any() stops at the first marker that is found.
        elif any(marker in text for marker in self.element_strings):
            selected.append(element)
        elif (self.check_element_by_rules
              and len(self.replacer.replace_text(text)) > 1):
            selected.append(element)
    return selected
def _get_elements(self, *, html_etree: etree._Element): if self.css_select: elements = html_etree.cssselect(self.css_select) elif self.xpath_select: elements = html_etree.xpath(self.xpath_select) else: raise ValueError('%s field: css_select or xpath_select is expected' % self.__class__.__name__) if not self.many: elements = elements[:1] return elements
def _get_elements(self, *, html_etree: etree._Element, json_dict: dict): if self.css_selector: elements = html_etree.cssselect(self.css_selector) elif self.xpath_selector: elements = html_etree.xpath(self.xpath_selector) elif self.json_selector: elements = jsonpath(json_dict, self.json_selector) if not elements: elements = [] else: elements = [] if not self.many: elements = elements[:1] return elements
def extract(self, element: Element) -> List[Element]:
    """
    Extract subelements from XML or HTML data.

    :param element: :class:`data_extractor.lxml.Element` object.
    :returns: Subelements returned by ``element.cssselect`` for the
        CSS selector expression ``self.expr``.
    :raises data_extractor.exceptions.ExprError: CSS Selector Expression Error.
    """
    try:
        return element.cssselect(self.expr)
    except SelectorSyntaxError as exc:
        # Chain explicitly so the traceback reports the selector syntax
        # error as the direct cause of the ExprError.
        raise ExprError(extractor=self, exc=exc) from exc
def _find_elements_step(self, step: list, parent: ET._Element = None) -> list: """ 单步查找元素 @param {list} step - 执行步骤参数,参数数组中的第0个是查找指令,其他数组对象是查找参数 ['children'] # 获取当前清单元素中的所有子元素 ['id', 'myId'] # 通过id获取元素 ['xpath', '//img[@id="dracga" and @style]'] # 通过xpaht获取元素 ['name', 'myName'] # 通过元素的name属性获取 ['tag_name', 'img'] # 通过元素的标签名获取 ['class_name', 'styleClass'] # 通过元素的class名获取 ['css_selector', '#kw'] # 通过css选择器方式获取,id用#kw, class用.s_ipt, 与css的简写方式相同 注:暂不支持 ['link_text', 'hao123'] # 通过超文本链接上的文字信息来定位元素 ['partial_link_text', 'hao'] # 通过超文本连接上的文字的一部分关键字信息来定位元素 @param {lxml.etree._Element} parent=None - 父节点,如果不传代表全局搜索 @returns {list} - 返回查找到的对象列表 注:对象类型为 lxml.etree._Element """ # 处理查找操作 _elements = list() _op = step[0] # 查找指令 if _op == 'children': # 获取所有子元素 _elements = self._find_elements_by_xpath('.//*', parent=parent) elif _op == 'id': # 通过id查找 _elements = self._find_elements_by_xpath('.//*[@id="%s"]' % step[1], parent=parent) elif _op == 'xpath': _elements = self._find_elements_by_xpath(step[1], parent=parent) elif _op == 'name': _elements = self._find_elements_by_xpath('.//*[@name="%s"]' % step[1], parent=parent) elif _op == 'tag_name': _elements = self._find_elements_by_xpath('.//%s' % step[1], parent=parent) elif _op == 'class_name': _elements = self._find_elements_by_xpath( './/*[@class="{0}" or starts-with(@class, "{0} ") or contains(@class, " {0} ") or ends-with(@class, " {0}")]' .format(step[1]), parent=parent) elif _op == 'link_text': _elements = self._find_elements_by_xpath( './/*[@href and text()="%s"]' % step[1], parent=parent) elif _op == 'partial_link_text': _elements = self._find_elements_by_xpath( './/*[@href and contains(string(), "%s")]' % step[1], parent=parent) elif _op == 'css_selector': if parent is None: parent = self.html_doc.root _elements = parent.cssselect(step[1]) else: # 没有匹配到类型 raise KeyError('not support find elements operator [%s]' % _op) return _elements
def get_items(self, element: _Element) -> Iterator[_Element]:
    """Yield each item returned by ``element.cssselect(self.css_selector)``.

    This is a generator, so the selector is only evaluated once iteration
    starts.

    :param element: element queried through its ``cssselect`` method.
    :returns: iterator over the selected items, in the order returned.
    """
    for item in element.cssselect(self.css_selector):
        yield item