예제 #1
0
    def filter_content(
            self, content: etree._Element) -> List[etree._Element]:
        """Return the elements under *content* that qualify for processing.

        Candidates are every element matching any tag in
        ``self.element_tags`` (joined into one comma-separated CSS selector).
        A candidate is kept, in document order, when it has text and at least
        one of the following holds:

        * it carries a ``censored_text`` attribute;
        * its text contains any string from ``self.element_strings``;
        * ``self.check_element_by_rules`` is truthy and
          ``self.replacer.replace_text(text)`` returns more than one item
          (presumably meaning a rule matched -- confirm against the replacer).

        :param content: Root element to search in.
        :returns: The qualifying elements.
        """
        candidates: List[etree._Element] = content.cssselect(
            ','.join(self.element_tags))
        selected: List[etree._Element] = []
        for element in candidates:
            text = element.text
            if text is None:
                continue

            if element.get('censored_text') is not None:
                selected.append(element)
                continue

            # No ``censored_text`` attribute at this point, so only the plain
            # text needs checking; ``any`` stops at the first matching marker.
            if any(marker in text for marker in self.element_strings):
                selected.append(element)
                continue

            if self.check_element_by_rules:
                if len(self.replacer.replace_text(text)) > 1:
                    selected.append(element)
        return selected
예제 #2
0
 def _get_elements(self, *, html_etree: etree._Element):
     """Select this field's elements from *html_etree*.

     ``self.css_select`` is tried first, then ``self.xpath_select``. When
     ``self.many`` is falsy the result is cut down to its first item (still
     returned as a sequence).

     :param html_etree: Parsed tree to run the selector against.
     :returns: The selector result, possibly truncated to one item.
     :raises ValueError: If neither selector is configured.
     """
     if self.css_select:
         found = html_etree.cssselect(self.css_select)
     elif self.xpath_select:
         found = html_etree.xpath(self.xpath_select)
     else:
         message = '%s field: css_select or xpath_select is expected' % (
             self.__class__.__name__)
         raise ValueError(message)
     return found if self.many else found[:1]
예제 #3
0
 def _get_elements(self, *, html_etree: etree._Element, json_dict: dict):
     """Select this field's elements from the HTML tree or the JSON data.

     The first configured selector wins, in this order:
     ``self.css_selector``, ``self.xpath_selector``, ``self.json_selector``.
     With no selector configured the result is an empty list. When
     ``self.many`` is falsy the result is cut down to its first item (still
     returned as a sequence).

     :param html_etree: Parsed tree used by the CSS and XPath selectors.
     :param json_dict: Data passed to ``jsonpath`` for the JSON selector.
     :returns: The selector result, possibly truncated to one item.
     """
     if self.css_selector:
         matches = html_etree.cssselect(self.css_selector)
     elif self.xpath_selector:
         matches = html_etree.xpath(self.xpath_selector)
     elif self.json_selector:
         # A falsy jsonpath result is normalised to an empty list.
         matches = jsonpath(json_dict, self.json_selector) or []
     else:
         matches = []
     return matches if self.many else matches[:1]
예제 #4
0
    def extract(self, element: Element) -> List[Element]:
        """
        Extract subelements from XML or HTML data with a CSS selector.

        :param element: :class:`data_extractor.lxml.Element` object.

        :returns: The subelements matched by ``self.expr``.

        :raises data_extractor.exceptions.ExprError: CSS Selector Expression Error.
        """
        try:
            return element.cssselect(self.expr)
        except SelectorSyntaxError as exc:
            # Chain explicitly so the traceback marks the syntax error as the
            # direct cause rather than an error raised while handling it.
            raise ExprError(extractor=self, exc=exc) from exc
예제 #5
0
    def _find_elements_step(self,
                            step: list,
                            parent: ET._Element = None) -> list:
        """
        Run a single element-lookup step.

        @param {list} step - step definition; item 0 is the lookup operator and
            the remaining items are its arguments:
            ['children']  # every element below the parent (XPath './/*')
            ['id', 'myId']  # elements whose id attribute equals the value
            ['xpath', '//img[@id="dracga" and @style]']  # raw XPath expression
            ['name', 'myName']  # elements whose name attribute equals the value
            ['tag_name', 'img']  # elements with the given tag name
            ['class_name', 'styleClass']  # elements having the given class token
            ['css_selector', '#kw']  # CSS selector (#kw for id, .s_ipt for class)
            ['link_text', 'hao123']  # elements with href whose text equals the value
            ['partial_link_text', 'hao']  # elements with href whose text contains the value
            NOTE(review): values are interpolated into double-quoted XPath
            literals, so a value containing '"' yields an invalid expression.
        @param {lxml.etree._Element} parent=None - node to search under; it is
            handed to ``_find_elements_by_xpath`` as-is, and for 'css_selector'
            None falls back to ``self.html_doc.root``

        @returns {list} - the matches returned by the underlying lookup
            (expected to be lxml.etree._Element objects)

        @raises {KeyError} - if the operator in step[0] is not supported
        """
        _op = step[0]  # lookup operator

        # CSS selectors are evaluated directly on the node, not through XPath.
        if _op == 'css_selector':
            if parent is None:
                parent = self.html_doc.root
            return parent.cssselect(step[1])

        # Operators that reduce to one XPath expression; '{0}' is step[1].
        _templates = {
            'children': './/*',
            'id': './/*[@id="{0}"]',
            'xpath': '{0}',
            'name': './/*[@name="{0}"]',
            'tag_name': './/{0}',
            # XPath 1.0 (what lxml evaluates) has no ends-with(); padding the
            # normalised class list with spaces matches the token anywhere.
            'class_name':
                './/*[contains(concat(" ", normalize-space(@class), " "), " {0} ")]',
            'link_text': './/*[@href and text()="{0}"]',
            'partial_link_text': './/*[@href and contains(string(), "{0}")]',
        }
        if _op not in _templates:
            # No matching operator
            raise KeyError('not support find elements operator [%s]' % _op)

        if _op == 'children':
            # Takes no argument, so step[1] must not be touched.
            _xpath = _templates[_op]
        else:
            _xpath = _templates[_op].format(step[1])
        return self._find_elements_by_xpath(_xpath, parent=parent)
예제 #6
0
 def get_items(self, element: _Element) -> Iterator[_Element]:
     """Yield each subelement of *element* matched by ``self.css_selector``.

     This is a generator, so the selector is only evaluated once iteration
     starts.
     """
     matches = element.cssselect(self.css_selector)
     yield from matches