def _scrape_item(self):
    """Scrape a single page and yield its title, first h1 text and declared charset."""
    soup = self._get_soup(self.inp)
    details = {
        "title": soup.select("title")[0].get_text(),
        "header": soup.select("h1")[0].get_text(),
        "charset": soup.select("meta[charset]")[0].get("charset"),
    }
    yield ScrapeResults(self.inp, details)
def _scrape_paged(self):
    """Walk overview pages starting at self.inp, yielding one result per page.

    Stops when the overview parser reports no next-page url.
    """
    url = self.inp
    while True:
        data, next_url = self._get_overview(url)
        yield ScrapeResults(url, data)
        # Guard clause: a falsy next_url means we reached the last page.
        if not next_url:
            logging.debug("No more pages, finishing up")
            break
        url = next_url
def _scrape_paged(self):
    """Follow the next-page chain starting at self.inp, yielding parsed pages.

    Stops when the page parser reports no next-page url.
    """
    # Fix: the original kept an `index` counter that was incremented but
    # never read anywhere — dead local, removed.
    url = self.inp
    while True:
        page, next_url = self._parse_page(url)
        yield ScrapeResults(url, page)
        if not next_url:
            logging.debug("No more pages, finishing up")
            break
        url = next_url
def _scrape_paged(self):
    """Paginate via the embedded props element, yielding each page's articles.

    Follows "next_page_path" links until the props dict no longer contains one.
    """
    url = self.inp
    logging.debug(f"Starting with < {url} >")
    while True:
        data = self._get_soup(url, "#z-nvg-cognac-props")
        yield ScrapeResults(url, data["articles"])
        # Last page: the props blob carries no pointer to a following page.
        if "next_page_path" not in data:
            logging.debug("No more pages, finishing up")
            break
        url = f'https://www.{self.domain}{data["next_page_path"]}'
        logging.debug(f"Setting next url: {url}")
def _scrape_item(self):
    """Scrape a product page, decoding the JSON embedded in the gallery's
    data-options attribute, and yield the normalized product fields."""
    soup = self._get_soup(self.inp)
    raw_options = soup.select(".gallery [data-options]")[0].get("data-options")
    # The attribute holds HTML-escaped JSON; unescape before parsing.
    options = json.loads(html.unescape(raw_options))
    product_id = soup.select(".product-panel__id p")[0].get_text()
    product = options["product"]
    yield ScrapeResults(
        self.inp,
        {
            "set": options["set"],
            "name": product["name"],
            "description": product["description"],
            "price": to_number(product["price"]),
            "product_id": to_number(product_id),
        },
    )
def _scrape_paged(self):
    """Page through search results by offset until the API reports zero items.

    Yields one ScrapeResults per page; ZeroItems from the search helper marks
    the end of the result set.
    """
    offset = 0
    while True:
        url = f"{self.inp}?offset={offset}&page-size={self.PAGE_SIZE}"
        # Fix: the original emitted this message AFTER fetching and after
        # bumping `offset` but BEFORE rebuilding `url`, so it logged the page
        # that had just been fetched as the "next" url. Log at the point the
        # url is actually set so the message is accurate.
        logging.debug(f"Setting next url: {url}")
        try:
            data = self._get_search(url)
        except ZeroItems:
            logging.debug("No more pages, finishing up")
            break
        yield ScrapeResults(url, data)
        offset += self.PAGE_SIZE
def _scrape_paged(self):
    """Page through search results by page number until the API reports zero items.

    Appends a default relevance query when self.inp has no "?q=" query string.
    Yields one ScrapeResults per page; ZeroItems marks the end of the results.
    """
    page = 0
    while True:
        if "?q=" in self.inp:
            url = f"{self.inp}&page={page}"
        else:
            url = f"{self.inp}?q=%3Arelevance&page={page}"
        # Fix: the original emitted this message AFTER fetching and after
        # bumping `page` but BEFORE rebuilding `url`, so it logged the page
        # that had just been fetched as the "next" url. Log at the point the
        # url is actually set so the message is accurate.
        logging.debug(f"Setting next url: {url}")
        try:
            data = self._get_search(url)
        except ZeroItems:
            logging.debug("No more pages, finishing up")
            break
        yield ScrapeResults(url, data)
        page += 1
def _scrape_item(self):
    """Parse the single page at self.inp and yield its scraped contents."""
    parsed = self._parse_page(self.inp)
    yield ScrapeResults(self.inp, parsed)
def _scrape_item(self):
    """Scrape a product detail page by extracting JSON from the #pdpMain element."""
    page = self._get_soup(self.inp)
    main_element = page.select("#pdpMain")[0]
    yield ScrapeResults(self.inp, parse_json(main_element))
def _scrape_item(self):
    """Yield the product props embedded in the page's #z-vegas-pdp-props element."""
    props = self._get_soup(self.inp, "#z-vegas-pdp-props")
    yield ScrapeResults(self.inp, props)
def _scrape_item(self):
    """Scrape a page via the JSON-LD structured data in its first
    application/ld+json script tag."""
    soup = self._get_soup(self.inp)
    ld_scripts = soup.select('script[type="application/ld+json"]')
    payload = json.loads(ld_scripts[0].get_text())
    yield ScrapeResults(self.inp, payload)