Example #1
    def build(self,
              url=None,
              wanted_list=None,
              html=None,
              request_args=None,
              update=False):
        """
        Automatically constructs a set of rules to scrape the specified target(s) from a web page.
            The learned rules are stored in stack_list.

        Parameters:
        ----------
        url: str, optional
            URL of the target web page. You should pass either url or html (or both).

        wanted_list: list, optional
            A list of needed contents to be scraped.
                AutoScraper learns a set of rules to scrape these targets.

        html: str, optional
            An HTML string may be passed instead of a URL.
                You should pass either url or html (or both).

        request_args: dict, optional
            A dictionary used to specify a set of additional request parameters used by requests
                module. You can specify proxy URLs, custom headers etc.

        update: bool, optional, defaults to False
            If True, new learned rules will be added to the previous ones.
            If False, all previously learned rules will be removed.

        Returns:
        --------
        List of the scraped results if every wanted item was matched; otherwise None.
        """

        soup = self._get_soup(url=url, html=html, request_args=request_args)

        result_list = []

        if update is False:
            # Discard previously learned rules unless updating
            self.stack_list = []

        # Normalize the wanted texts so they compare reliably with the parsed HTML
        wanted_list = [unicodedata.normalize("NFKD", w) for w in wanted_list]

        for wanted in wanted_list:
            children = self._get_children(soup, wanted, url)

            for child in children:
                result, stack = self._get_result_for_child(child, soup, url)
                result_list += result
                self.stack_list.append(stack)

        # Deduplicate the collected results while preserving order
        result_list = unique_hashable(result_list)

        # Succeed only if every wanted item was matched by at least one rule
        if all(w in result_list for w in wanted_list):
            self.stack_list = unique_stack_list(self.stack_list)
            return result_list

        return None
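
For context, a minimal usage sketch of this build API (a hedged example: the URL and wanted text are illustrative placeholders, and AutoScraper is assumed to be the enclosing class):

from autoscraper import AutoScraper

url = 'https://stackoverflow.com/questions/2081586/web-scraping-with-python'
wanted_list = ['What are metaclasses in Python?']  # hypothetical target text

scraper = AutoScraper()
# build() learns scraping rules from the page; per the docstring above it
# returns the matched results, or None if any wanted item was not found
results = scraper.build(url=url, wanted_list=wanted_list)
print(results)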
Example #2
    def _clean_result(result_list, grouped_result, grouped, grouped_by_alias, unique, keep_order):
        # Flat (ungrouped) output: optionally restore document order, then
        # extract the text of each result and deduplicate by default
        if not grouped and not grouped_by_alias:
            if unique is None:
                unique = True
            if keep_order:
                result_list = sorted(result_list, key=lambda x: x.index)
            result = [x.text for x in result_list]
            if unique:
                result = unique_hashable(result)
            return result

        # Grouped output: clean each group's results separately
        for k, val in grouped_result.items():
            if grouped_by_alias:
                # An alias may merge results from several rules, so re-sort by index
                val = sorted(val, key=lambda x: x.index)
            val = [x.text for x in val]
            if unique:
                val = unique_hashable(val)
            grouped_result[k] = val

        return dict(grouped_result)
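
Both paths rely on a unique_hashable helper to deduplicate results. A plausible order-preserving implementation (a sketch under the assumption that first-seen order must be kept, not necessarily the library's exact code):

from collections import OrderedDict

def unique_hashable(items):
    # Drop duplicates while keeping the first occurrence of each item
    return list(OrderedDict.fromkeys(items))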
Example #3
    def _get_result_by_func(self, func, url, html, soup, request_args, grouped):
        if not soup:
            soup = self._get_soup(url=url, html=html, request_args=request_args)

        result_list = []
        grouped_result = defaultdict(list)  # from the collections module
        for stack in self.stack_list:
            # Fall back to the URL the rule was learned from
            if not url:
                url = stack.get('url', '')

            # Apply the learned rule (stack) to the page via the supplied function
            result = func(stack, soup, url)

            if not grouped:
                result_list += result
                continue

            # Group results under the id of the rule that produced them
            stack_id = stack['stack_id']
            grouped_result[stack_id] += result

        return dict(grouped_result) if grouped else unique_hashable(result_list)
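
This is an internal dispatcher: the public result getters presumably pass in the function that evaluates one rule against the page. A hedged sketch of calling the grouped mode through such a getter (get_result_similar, save, and load exist in AutoScraper's public API; the file name and URL are placeholders):

from autoscraper import AutoScraper

scraper = AutoScraper()
scraper.load('rules.json')  # assumes rules were saved earlier with save()

# grouped=True returns a dict keyed by stack_id instead of a flat list
grouped = scraper.get_result_similar('https://example.com/page', grouped=True)
for stack_id, texts in grouped.items():
    print(stack_id, texts)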
Example #4
    def build(self,
              url=None,
              wanted_list=None,
              wanted_dict=None,
              html=None,
              request_args=None,
              update=False,
              text_fuzz_ratio=1.0):
        """
        Automatically constructs a set of rules to scrape the specified target(s) from a web page.
            The learned rules are stored in stack_list.

        Parameters:
        ----------
        url: str, optional
            URL of the target web page. You should pass either url or html (or both).

        wanted_list: list, optional
            A list of needed contents to be scraped.
                AutoScraper learns a set of rules to scrape these targets. If specified,
                wanted_dict will be ignored.
        
        wanted_dict: dict, optional
            A dict of needed contents to be scraped. Keys are aliases and values are lists of target texts.
                AutoScraper learns a set of rules to scrape these targets and records their aliases.

        html: str, optional
            An HTML string may be passed instead of a URL.
                You should pass either url or html (or both).

        request_args: dict, optional
            A dictionary used to specify a set of additional request parameters used by requests
                module. You can specify proxy URLs, custom headers etc.

        update: bool, optional, defaults to False
            If True, new learned rules will be added to the previous ones.
            If False, all previously learned rules will be removed.

        text_fuzz_ratio: float in range [0, 1], optional, defaults to 1.0
            The fuzziness ratio threshold for matching the wanted contents.

        Returns:
        --------
        List of the scraped results that matched the wanted contents.
        """

        soup = self._get_soup(url=url, html=html, request_args=request_args)

        result_list = []

        if update is False:
            # Discard previously learned rules unless updating
            self.stack_list = []

        # A plain wanted_list takes precedence and is treated as one unaliased group
        if wanted_list:
            wanted_dict = {'': wanted_list}

        wanted_list = []

        for alias, wanted_items in wanted_dict.items():
            # Normalize each target text before matching
            wanted_items = [
                unicodedata.normalize("NFKD", w) for w in wanted_items
            ]
            wanted_list += wanted_items

            for wanted in wanted_items:
                children = self._get_children(soup, wanted, url,
                                              text_fuzz_ratio)

                for child in children:
                    result, stack = self._get_result_for_child(
                        child, soup, url)
                    # Tag the learned rule with its alias for grouped retrieval
                    stack['alias'] = alias
                    result_list += result
                    self.stack_list.append(stack)

        # Keep only the matched texts, deduplicated while preserving order
        result_list = [item.text for item in result_list]
        result_list = unique_hashable(result_list)

        self.stack_list = unique_stack_list(self.stack_list)
        return result_list
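
A hedged usage sketch of this newer build signature with aliases and fuzzy matching (the URL, aliases, and target texts are illustrative placeholders, and the group_by_alias flag on the getter is assumed from the grouped_by_alias handling in Example #2):

from autoscraper import AutoScraper

scraper = AutoScraper()
wanted_dict = {
    'title': ['Some product title as it appears on the page'],  # hypothetical
    'price': ['$19.99'],                                        # hypothetical
}
# text_fuzz_ratio below 1.0 tolerates small differences in the target texts
scraper.build(url='https://example.com/product',
              wanted_dict=wanted_dict,
              text_fuzz_ratio=0.9)

# Aliased results can then be fetched grouped by alias
results = scraper.get_result_similar('https://example.com/product',
                                     group_by_alias=True)
print(results.get('title'), results.get('price'))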