def build(self, url=None, wanted_list=None, html=None, request_args=None):
    """
    Automatically constructs a set of rules to scrape the specified target(s)
    from a web page. The learned rules are stored in stack_list.

    Parameters
    ----------
    url : str, optional
        URL of the target web page. You should pass either url or html.
    wanted_list : list, optional
        A list of the contents to be scraped. AutoScraper learns a set of
        rules to scrape these targets.
    html : str, optional
        An HTML string may be passed instead of a URL. You should pass
        either url or html.
    request_args : dict, optional
        A dictionary of additional request parameters used by the requests
        module. You can specify proxy URLs, custom headers, etc.

    Returns
    -------
    list or None
        The deduplicated list of scraped results if every wanted item was
        found, otherwise None.
    """
    self.url = url
    soup = self._get_soup(url=url, html=html, request_args=request_args)

    result_list = []
    stack_list = []

    # Learn one rule (stack) per matching element for each wanted item.
    for wanted in wanted_list:
        children = self._get_children(soup, wanted)

        for child in children:
            result, stack = self._get_result_for_child(child, soup)
            result_list += result
            stack_list.append(stack)

    result_list = unique(result_list)

    # Keep the learned rules only if they recover every wanted item.
    if all(w in result_list for w in wanted_list):
        self.stack_list = unique(stack_list)
        return result_list

    return None
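# Usage sketch for build() (illustrative only; the URL and wanted_list values
# below are hypothetical, not taken from the source):
#
#     scraper = AutoScraper()
#     results = scraper.build(
#         url='https://example.com/catalog',
#         wanted_list=['First Item Title'],
#     )
#     if results is not None:
#         print(results)  # everything the learned rules scraped
#     else:
#         ...  # not all wanted items could be located on the page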
def get_result_similar(self, url=None, html=None, soup=None, request_args=None):
    """
    Applies the previously learned rules to a page and returns all
    similar results.
    """
    if url:
        self.url = url

    if not soup:
        soup = self._get_soup(url=url, html=html, request_args=request_args)

    result = []
    # Apply every learned rule and collect all of its matches.
    for stack in self.stack_list:
        result += self._get_result_with_stack(stack, soup)

    return unique(result)
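# Usage sketch for get_result_similar() (hypothetical URL; assumes a prior
# build() call has populated self.stack_list):
#
#     similar = scraper.get_result_similar(url='https://example.com/catalog?page=2')
#     # -> deduplicated list of everything the learned rules match on the new page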
def _get_result_by_func(self, func, url, html, soup, request_args, grouped):
    if url:
        self.url = url

    if not soup:
        soup = self._get_soup(url=url, html=html, request_args=request_args)

    result_list = []
    grouped_result = defaultdict(list)

    for stack in self.stack_list:
        result = func(stack, soup)

        if not grouped:
            result_list += result
            continue

        # When grouped, keep results keyed by the rule that produced them.
        stack_id = stack['stack_id']
        grouped_result[stack_id] += result

    return dict(grouped_result) if grouped else unique(result_list)
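# Sketch of the two return shapes of _get_result_by_func (the stack ids below
# are hypothetical):
#
#     _get_result_by_func(func, ..., grouped=False)
#     # -> ['result a', 'result b']        # flat, deduplicated list
#
#     _get_result_by_func(func, ..., grouped=True)
#     # -> {'rule_h2f3': ['result a'],     # one entry per learned rule,
#     #     'rule_9k1d': ['result b']}     # keyed by stack['stack_id']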
def get_result_exact(self, url=None, html=None, soup=None, request_args=None):
    """
    Applies the previously learned rules to a page, returning one
    index-based (exact-position) result per rule.
    """
    if url:
        self.url = url

    if not soup:
        soup = self._get_soup(url=url, html=html, request_args=request_args)

    result = []
    for stack in self.stack_list:
        try:
            result.append(self._get_result_with_stack_index_based(stack, soup))
        except IndexError:
            # A rule may not match at the expected position on this page.
            continue

    return unique(result)
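# Usage sketch for get_result_exact() (hypothetical URL; assumes learned
# rules from a prior build()):
#
#     exact = scraper.get_result_exact(url='https://example.com/item/42')
#     # -> one index-based match per rule; rules that raise IndexError
#     #    (no match at the expected position) are skipped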