Exemplo n.º 1
0
    def build(self, url=None, wanted_list=None, html=None, request_args=None):
        """
        Automatically constructs a set of rules to scrape the specified target[s] from a web page.
            The rules are represented as stack_list.

        Parameters:
        ----------
        url : str, optional
            URL of the target web page. You should either pass url or html.

        wanted_list : list
            A list of needed contents to be scraped.
                AutoScraper learns a set of rules to scrape these targets.
                The default of None exists only for signature compatibility;
                a value must be supplied.

        html : str, optional
            An HTML string can also be passed instead of URL.
                You should either pass url or html.

        request_args : dict, optional
            A dictionary used to specify a set of additional request parameters used by requests
                module. You can specify proxy URLs, custom headers etc.

        Returns:
        --------
        list or None
            The results collected for all wanted items (after being passed through
                `unique`) when every item of wanted_list is among them.
                None otherwise; in that case self.stack_list is not modified.

        Raises:
        -------
        TypeError
            If wanted_list is None.
        """

        if wanted_list is None:
            # Fail before the page is fetched/parsed instead of crashing later
            # with an opaque "'NoneType' object is not iterable".
            raise TypeError(
                "build() requires wanted_list: pass a list of contents to scrape")

        # wanted_list is traversed twice (rule learning, then verification);
        # materialise it so a one-shot iterable cannot make the verification
        # below vacuously succeed.
        wanted_list = list(wanted_list)

        self.url = url
        soup = self._get_soup(url=url, html=html, request_args=request_args)

        result_list = []
        stack_list = []

        for wanted in wanted_list:
            for child in self._get_children(soup, wanted):
                result, stack = self._get_result_for_child(child, soup)
                result_list += result
                stack_list.append(stack)

        result_list = unique(result_list)

        # Only commit the learned rules when they reproduce every wanted item.
        if all(w in result_list for w in wanted_list):
            self.stack_list = unique(stack_list)
            return result_list

        return None
Exemplo n.º 2
0
    def build(self, url=None, wanted_list=None, html=None, request_args=None):
        """
        Learn scraping rules for the given targets and store them on success.

        The page is obtained through self._get_soup from either url or html.
        For each wanted item, every node returned by self._get_children is
        turned into a (result, stack) pair by self._get_result_for_child.

        Parameters:
        ----------
        url : str, optional
            Forwarded to self._get_soup; also stored on self.url.
        wanted_list : list
            Contents that the learned rules must be able to reproduce.
            Required, even though the signature defaults it to None.
        html : str, optional
            Forwarded to self._get_soup.
        request_args : dict, optional
            Forwarded to self._get_soup.

        Returns:
        --------
        list or None
            The collected results (passed through `unique`) if they contain
            every item of wanted_list, in which case self.stack_list is
            replaced by the learned stacks. None otherwise, leaving
            self.stack_list as it was.

        Raises:
        -------
        TypeError
            If wanted_list is None.
        """
        if wanted_list is None:
            # Reject the missing argument up front, before any page is loaded,
            # rather than failing mid-way with "'NoneType' object is not iterable".
            raise TypeError(
                "build() requires wanted_list: pass a list of contents to scrape")

        # Iterated twice below; a list keeps the second pass meaningful even
        # if the caller handed in a generator.
        wanted_list = list(wanted_list)

        self.url = url
        soup = self._get_soup(url=url, html=html, request_args=request_args)

        result_list = []
        stack_list = []

        for wanted in wanted_list:
            for child in self._get_children(soup, wanted):
                result, stack = self._get_result_for_child(child, soup)
                result_list += result
                stack_list.append(stack)

        result_list = unique(result_list)

        # Keep the new rules only if they recover all requested contents.
        if all(w in result_list for w in wanted_list):
            self.stack_list = unique(stack_list)
            return result_list

        return None
Exemplo n.º 3
0
    def get_result_similar(self,
                           url=None,
                           html=None,
                           soup=None,
                           request_args=None):
        """
        Apply every stored stack to a page via self._get_result_with_stack.

        A truthy url is remembered on self.url. When no (truthy) soup is
        given, one is obtained from self._get_soup using url, html and
        request_args. The per-stack results are concatenated in
        self.stack_list order and returned after being passed through
        `unique` (presumably de-duplication; the helper is defined elsewhere).
        """
        if url:
            self.url = url

        # Reuse the caller's parsed page when provided; otherwise build one.
        page = soup or self._get_soup(url=url,
                                      html=html,
                                      request_args=request_args)

        collected = [
            item
            for rule in self.stack_list
            for item in self._get_result_with_stack(rule, page)
        ]
        return unique(collected)
Exemplo n.º 4
0
    def _get_result_by_func(self, func, url, html, soup, request_args,
                            grouped):
        if url:
            self.url = url

        if not soup:
            soup = self._get_soup(url=url,
                                  html=html,
                                  request_args=request_args)

        result_list = []
        grouped_result = defaultdict(list)
        for stack in self.stack_list:
            result = func(stack, soup)

            if not grouped:
                result_list += result
                continue

            stack_id = stack['stack_id']
            grouped_result[stack_id] += result

        return dict(grouped_result) if grouped else unique(result_list)
Exemplo n.º 5
0
    def get_result_exact(self,
                         url=None,
                         html=None,
                         soup=None,
                         request_args=None):
        """
        Apply every stored stack via self._get_result_with_stack_index_based.

        A truthy url is remembered on self.url. When soup is falsy, it is
        obtained from self._get_soup using url, html and request_args.
        Each stack contributes one value; stacks whose lookup raises
        IndexError are skipped. The collected values are returned after
        being passed through `unique`.
        """
        if url:
            self.url = url

        page = soup or self._get_soup(url=url,
                                      html=html,
                                      request_args=request_args)

        matches = []
        for rule in self.stack_list:
            try:
                value = self._get_result_with_stack_index_based(rule, page)
            except IndexError:
                # This stack does not resolve on this page; skip it.
                continue
            matches.append(value)

        return unique(matches)