def parse(self, response: scrapy.http.response.html.HtmlResponse):
    """Schedule requests for the links found on a listing page.

    Two groups of requests are yielded, in this order:

    * one per ``href`` of every anchor nested in a table row, rewritten
      to the ``_escaped_fragment_`` topic URL and handled by
      ``self.parse_topic``;
    * one per anchor that is a direct child of ``<body>``, handled by
      this method again (these appear to be the pagination links).
    """
    for href in response.css("tr a::attr(href)"):
        # Only the last part of the original link is kept; the rest of
        # the URL is rebuilt from the spider's own root and name.
        url_parts = (self.root, self.name, self.last_part(href.extract()))
        yield response.follow(
            "%s/forum/?_escaped_fragment_=topic/%s/%s" % url_parts,
            self.parse_topic)

    for anchor in response.css("body > a"):
        here = self.last_part(response.url)
        there = self.last_part(anchor.css("::attr(href)").extract_first())
        self.log("Page: %s -> %s" % (here, there))
        yield response.follow(anchor, self.parse)
def parse_topic(self, response: scrapy.http.response.html.HtmlResponse):
    """Collect the messages of one topic and request the missing ones.

    Every table row that carries a subject link is recorded as a message
    (id, author, date, file).  For each such message a request for its
    raw form is yielded, with ``self.save_email`` as callback, unless the
    path returned by ``self.locate_email_file`` already exists on disk.
    After all rows have been visited, a single item describing the topic
    and its messages is yielded.
    """
    thread_id = self.last_part(response.url)
    collected = []

    # The row index counts every <tr>, including rows skipped below,
    # because it is passed on to locate_email_file().
    for row_index, row in enumerate(response.css("tr")):
        subject_href = row.css(
            "td[class=subject] > a::attr(href)").extract_first()
        if subject_href is None:
            # Row without a subject link: nothing to record.
            continue

        msg_id = self.last_part(subject_href)
        author = row.css("td[class=author] ::text").extract_first()
        posted = row.css("td[class=lastPostDate] ::text").extract_first()
        # NOTE(review): the meaning of the trailing boolean is not visible
        # here; False is used for the value stored in the item, True for
        # the path that is checked on disk and handed to save_email.
        item_file = self.locate_email_file(
            thread_id, row_index, msg_id, False)
        collected.append({
            "id": msg_id,
            "author": author,
            "date": posted,
            "file": item_file
        })

        target_path = self.locate_email_file(
            thread_id, row_index, msg_id, True)
        if os.path.exists(target_path):
            self.log("Skipped %s/%s - already fetched" % (thread_id, msg_id))
        else:
            raw_url = "%s/%s/message/raw?msg=%s/%s/%s" % (
                self.root, self.prefix, self.name, thread_id, msg_id)
            on_download = functools.partial(
                self.save_email, file_name=target_path)
            yield response.follow(raw_url, on_download)

    yield {
        "topic": response.css("h2 ::text").extract_first(),
        "id": thread_id,
        "messages": collected
    }
def parse(self, response: scrapy.http.response.html.HtmlResponse):
    """Yield one detail-page request per usable ``<option>`` on the page.

    The options of the ``<select>`` inside ``div.controls`` are scanned;
    each option whose ``value`` passes ``is_positiveinteger`` is appended
    to the kinodetail base URL and followed with ``self.parse_cinema`` as
    callback.  Options that have no ``value`` attribute at all are
    skipped rather than raising ``KeyError``.
    """
    # current movies:
    base_kinodetail_url = 'https://www.berlin.de/kino/_bin/kinodetail.php/'

    for option in response.xpath('//div[@class="controls"]/select/option'):
        # An <option> without a value attribute cannot form a detail URL;
        # skipping it keeps one odd option from aborting the whole page.
        value = option.attrib.get('value')
        if value is None or not is_positiveinteger(value):
            continue

        href = base_kinodetail_url + value
        # Lazy logger argument: same message, formatted only if emitted.
        self.logger.info('Scraping: %s', href)
        yield response.follow(href, self.parse_cinema)