import contextlib
from dataclasses import asdict

from pymongo.errors import DuplicateKeyError
from scrapy.http import TextResponse


def parse(self, response: TextResponse, **kwargs):
    # Stop paginating once the configured page budget is exhausted.
    if self.n_pages == self.max_pages:
        return
    self.n_pages += 1

    links = response.css("#recent p.title > a::attr(href)").extract()
    if links:
        # Listing page: follow only ads we have not stored yet
        # (the ad URL doubles as the MongoDB _id).
        for link in links:
            if self.db.rentals.count_documents({"_id": link}, limit=1) == 0:
                yield response.follow(link, callback=self.parse)
    else:
        # Detail page: parse the ad and persist it, ignoring duplicates.
        # parse_ad_page_html is a project helper defined elsewhere.
        ad = parse_ad_page_html(response.body)
        with contextlib.suppress(DuplicateKeyError):
            self.db.rentals.insert_one(asdict(ad))
        yield ad

    next_page = response.css(".next a")
    if next_page:
        yield response.follow(next_page[0], callback=self.parse)
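
# A minimal sketch of the dedup behaviour the spider above relies on: MongoDB
# enforces uniqueness on _id, so inserting the same ad URL twice raises
# DuplicateKeyError, which contextlib.suppress swallows. The database name and
# sample document here are illustrative assumptions, not part of the project.
import contextlib

from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError

db = MongoClient().test
db.rentals.insert_one({"_id": "https://example.com/ad/1", "price": 1200})
with contextlib.suppress(DuplicateKeyError):
    # The second insert with the same _id is silently ignored, as in parse().
    db.rentals.insert_one({"_id": "https://example.com/ad/1", "price": 1200})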

from urllib.parse import urlsplit

from scrapy.http import TextResponse
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor


def parse(self, response: TextResponse):
    # article = newspaper.Article(response.url)
    # article.set_html(response.text)
    # article.parse()
    if response.status != 200:
        return
    yield {
        # 'title': article.title,
        # 'content': article.text,
        # 'published_at': article.publish_date,
        # 'authors': article.authors,
        'url': response.url,
        'html': response.text,
    }
    for link in LxmlLinkExtractor(allow_domains=self.domains).extract_links(response):
        # Rebuild the link without its query string and fragment, keeping
        # scheme-relative ("//host/path") URLs intact.
        split = urlsplit(link.url)
        scheme = (split.scheme + '://') if len(split.scheme) > 1 else ''
        if len(scheme) < 1 and len(split.netloc) > 1 and split.netloc[0] != '/':
            scheme = '//'
        url = scheme + split.netloc + split.path
        yield response.follow(url, callback=self.parse)
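
# A quick check of the URL normalisation above on a made-up link: the query
# string and fragment are dropped, while the scheme, host, and path are kept.
# The sample URL is illustrative only.
from urllib.parse import urlsplit

split = urlsplit("https://example.com/news/story?utm_source=x#top")
scheme = (split.scheme + '://') if len(split.scheme) > 1 else ''
print(scheme + split.netloc + split.path)  # https://example.com/news/story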

import requests
from scrapy.http import TextResponse


def fetch(url, meta=None, *args, **kwargs):
    """Fetch *url* with requests and wrap the result in a Scrapy TextResponse."""
    resp = requests.get(url, *args, **kwargs, timeout=30)
    resp.encoding = 'UTF-8'
    rv = TextResponse(resp.url,
                      status=resp.status_code,
                      body=resp.text,
                      encoding='UTF-8')
    # Attach a Request so response.meta and response.follow() behave as they
    # would inside a running spider.
    rv.request = rv.follow(url, meta=meta)
    _set_response(rv)  # _set_response is a module-level helper defined elsewhere
    return rv
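
# A hedged usage sketch for fetch(): because it returns a Scrapy TextResponse,
# the usual selector API works outside a running spider (handy in a REPL or in
# tests). The URL and meta key below are examples, not values from the project.
response = fetch("https://example.com", meta={"depth": 0})
print(response.status)                     # HTTP status from requests
print(response.css("title::text").get())  # Scrapy selectors work on the wrapper
print(response.meta["depth"])              # meta carried via the attached Request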

from urllib.parse import urlparse

import newspaper
from scrapy.http import TextResponse
from scrapy.linkextractors import LinkExtractor


def parse(self, response: TextResponse):
    article = newspaper.Article(response.url)
    article.set_html(response.text)
    article.parse()
    yield {
        'title': article.title,
        'content': article.text,
        'published_at': article.publish_date,
        'authors': article.authors,
    }
    # allow_domains expects bare domain names, not full URLs.
    domain = urlparse(response.url).netloc
    for link in LinkExtractor(allow_domains=domain).extract_links(response):
        yield response.follow(link, callback=self.parse)

from datetime import datetime

from scrapy.http import TextResponse


def parse(self, response: TextResponse):
    # Collect the data required for a row in the pages table.
    r_url = response.url
    r_page = response.text
    r_time = datetime.now()
    print(__file__, "CrawlCategory.parse()",
          "scraping for pages: {}".format(r_url))

    # Create the SQLAlchemy Page object and stage it on the session.
    pge = Page(url=r_url,
               html=r_page,
               date=r_time,
               category=CrawlCategory.catObject)
    CrawlCategory.dbSession.add(pge)

    # Follow the next page, if the pagination link exists.
    next_page = response.css("li.next a::attr(href)").get()
    if next_page is not None:
        yield response.follow(next_page, callback=self.parse)

from typing import Dict, Generator

from scrapy.http import TextResponse


def parse(self, response: TextResponse) -> Generator[Dict[str, str], None, None]:
    # Guard against a missing link: response.follow() raises on None.
    chart_link = response.xpath(self.chart_link_xpath).get()
    if chart_link is not None:
        yield response.follow(chart_link, callback=self._parse_playlist)

import json

import jq

from scrapy.http import TextResponse


def parse_teams(self, response: TextResponse):
    # jq needs the parsed JSON document, not the raw response text:
    # team_links = jq.compile(".teams[].link").input(response.text).all()
    json_response = json.loads(response.body)
    team_links = jq.compile(".teams[].link").input(json_response).all()
    for link in team_links:
        yield response.follow(link, callback=self.parse_team)
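
# A small, self-contained illustration of the jq call used above, assuming the
# `jq` PyPI bindings: .input() takes the parsed JSON value and .all() collects
# every match produced by the programme. The sample document is made up.
import jq

doc = {"teams": [{"link": "/api/v1/teams/1"}, {"link": "/api/v1/teams/2"}]}
print(jq.compile(".teams[].link").input(doc).all())
# ['/api/v1/teams/1', '/api/v1/teams/2']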