示例#1
0
    def _get_result_with_stack(self, stack, soup, url, attr_fuzz_ratio, **kwargs):
        parents = [soup]
        stack_content = stack['content']
        contain_sibling_leaves = kwargs.get('contain_sibling_leaves', False)
        for index, item in enumerate(stack_content):
            children = []
            for parent in parents:

                attrs = item[1]
                if attr_fuzz_ratio < 1.0:
                    attrs = self._get_fuzzy_attrs(attrs, attr_fuzz_ratio)

                found = parent.findAll(item[0], attrs, recursive=False)
                if not found:
                    continue

                if not contain_sibling_leaves and index == len(stack_content) - 1:
                    idx = min(len(found) - 1, stack_content[index - 1][2])
                    found = [found[idx]]

                children += found

            parents = children

        wanted_attr = stack['wanted_attr']
        is_full_url = stack['is_full_url']
        is_non_rec_text = stack.get('is_non_rec_text', False)
        result = [ResultItem(self._fetch_result_from_child(i, wanted_attr,
                              is_full_url, url, is_non_rec_text),
                              getattr(i, 'child_index', 0)) for i in parents]
        result = [x for x in result if x.text]
        return result
示例#2
0
    def _get_result_with_stack_index_based(self, stack, soup, url,
                                           attr_fuzz_ratio, **kwargs):
        p = soup.findChildren(recursive=False)[0]
        stack_content = stack["content"]
        for index, item in enumerate(stack_content[:-1]):
            if item[0] == "[document]":
                continue
            content = stack_content[index + 1]
            attrs = content[1]
            if attr_fuzz_ratio < 1.0:
                attrs = self._get_fuzzy_attrs(attrs, attr_fuzz_ratio)
            p = p.findAll(content[0], attrs, recursive=False)
            if not p:
                return []
            idx = min(len(p) - 1, item[2])
            p = p[idx]

        result = [
            ResultItem(
                self._fetch_result_from_child(
                    p,
                    stack["wanted_attr"],
                    stack["is_full_url"],
                    url,
                    stack["is_non_rec_text"],
                ),
                getattr(p, "child_index", 0),
            )
        ]
        result = [x for x in result if x.text]
        return result
示例#3
0
    def _get_result_with_stack_index_based(self, stack, soup, url, **kwargs):
        p = soup.findChildren(recursive=False)[0]
        stack_content = stack['content']
        for index, item in enumerate(stack_content[:-1]):
            content = stack_content[index + 1]
            p = p.findAll(content[0], content[1], recursive=False)
            if not p:
                return []
            idx = min(len(p) - 1, item[2])
            p = p[idx]

        result = [
            ResultItem(
                self._fetch_result_from_child(p, stack['wanted_attr'],
                                              stack['is_full_url'], url),
                getattr(p, 'child_index', 0))
        ]
        result = [x for x in result if x.text]
        return result