def _parse_article(self, response):
    # Bot detection pages do not contain an og:title meta tag.
    title = response.css('meta[property="og:title"]::attr(content)').extract_first()
    if not title:
        raise DropResponse(
            "Skipping {} because we ran into bot detection".format(response.url),
            transient=True,
        )

    # Elements that should not end up in the feed content.
    remove_elems = [
        "meta",
        ".ds-share-list",
        ".advert",
        ".layout-article-links",
        ".ds-chapter-list",
        ".layout-article-meta",
    ]
    # Map site-specific markup to semantic HTML tags.
    change_tags = {
        ".article__lead-image": "figure",
        ".article__description": "h2",
        ".article__footnote": "i",
    }
    il = FeedEntryItemLoader(
        response=response,
        base_url="https://{}".format(self.name),
        remove_elems=remove_elems,
        change_tags=change_tags,
    )
    il.add_value("link", response.url)
    il.add_value("title", title)
    il.add_css("updated", "time.article__dateline-datetime::attr(datetime)")
    il.add_css("content_html", ".article__lead-image")
    il.add_css("content_html", ".article__description")
    il.add_css("content_html", ".layout-article-body")
    il.add_value("path", response.meta["ressort"])
    return il.load_item()
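# Note on the loader arguments above (assumed semantics of FeedEntryItemLoader's
# HTML cleanup, inferred from how they are used here): remove_elems drops
# matching elements from the extracted content entirely, while change_tags
# keeps an element's content and only renames its tag, e.g.
#   <div class="article__description">Teaser</div>  ->  <h2>Teaser</h2>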
def _parse_stream(self, response):
    il = response.meta["il"]
    if response.status != 200:
        # The stream is not available yet; retry on a later crawl instead of
        # emitting an item with a broken enclosure.
        url = il.get_output_value("link")
        raise DropResponse(
            f"Skipping {url} because it is not downloadable yet", transient=True
        )
    yield il.load_item()
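# A minimal sketch of the DropResponse exception raised throughout these
# callbacks. This is the assumed shape, matching how it is called here; the
# real class ships with the feeds framework (feeds.exceptions in PyFeeds).
class DropResponse(Exception):
    """Drop a response and log the reason.

    transient=True marks drops expected to resolve on a later crawl (bot
    detection, content not yet published), so the framework can log them
    less severely than permanent failures.
    """

    def __init__(self, message, transient=False):
        super().__init__(message)
        self.transient = transient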
def _parse_episode(self, response):
    item = json.loads(response.text)
    il = FeedEntryItemLoader()
    il.add_value("title", item["title"])
    il.add_value(
        "content_html",
        '<img src="{}">'.format(item["playlist"]["preview_image_url"]),
    )
    if item["description"]:
        il.add_value("content_html", item["description"].replace("\r\n", "<br>"))
    il.add_value("updated", item["date"])
    # Link to the public site, not the API.
    il.add_value("link", item["url"].replace("api-tvthek.orf.at", "tvthek.orf.at"))
    if len(item["_embedded"]["segments"]) == 1:
        # If an episode has only one segment, item["sources"] contains invalid
        # links, so we use the sources of the first embedded segment instead.
        # This is also how mediathekviewweb.de handles it.
        item["sources"] = item["_embedded"]["segments"][0]["sources"]
    try:
        video = next(
            s
            for s in item["sources"]["progressive_download"]
            if s["quality_key"] == "Q8C"
        )
        il.add_value("enclosure", {"iri": video["src"], "type": "video/mp4"})
    except StopIteration:
        self.logger.warning(
            "Could not extract video for '{}'!".format(item["title"])
        )
        raise DropResponse(
            f"Skipping {response.url} because it is not downloadable yet",
            transient=True,
        )
    subtitle = item["_embedded"].get("subtitle")
    if subtitle:
        srt_file = subtitle["_embedded"]["srt_file"]["public_urls"]["reference"]
        il.add_value("enclosure", {"iri": srt_file["url"], "type": "text/plain"})
    else:
        self.logger.debug("No subtitle file found for '{}'".format(item["url"]))
    il.add_value(
        "category",
        self._categories_from_oewa_base_path(
            item["_embedded"]["profile"]["oewa_base_path"]
        ),
    )
    return il.load_item()
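# A hypothetical sketch of the _categories_from_oewa_base_path helper used
# above. This is assumed behavior, not the actual implementation: OEWA base
# paths look roughly like "RedCont/KulturUndFreizeit/FilmUndKino", and the
# helper turns the path segments into readable category names.
import re  # normally at module top


def _categories_from_oewa_base_path(self, oewa_base_path):
    categories = []
    for segment in oewa_base_path.split("/"):
        if segment in ("RedCont", "Sonstiges"):
            # Skip the common prefix and the generic catch-all bucket
            # (assumed segment names for illustration).
            continue
        # Split CamelCase: "KulturUndFreizeit" -> "Kultur Und Freizeit".
        categories.append(re.sub(r"(?<!^)(?=[A-Z])", " ", segment))
    return categories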
def _parse_article_url(self, response):
    if not response.css("#content"):
        raise DropResponse(
            f"Skipping {response.url} since it is empty", transient=True
        )

    # extract_first() returns None if there is no <h2>, so default to "".
    if "Fehler" in (response.css("h2 ::text").extract_first() or ""):
        raise DropResponse(
            f"Skipping {response.url} since it returned an error",
            transient=True,
        )

    remove_elems = ['div[style="padding-top:10px;"]']
    il = FeedEntryItemLoader(
        response=response,
        timezone="Europe/Vienna",
        base_url=f"https://{self.name}",
        dayfirst=True,
        remove_elems=remove_elems,
    )
    il.add_value("link", response.url)
    il.add_value("author_name", "VKI")
    date = response.css(".issue").re_first(
        r"veröffentlicht:\s*([0-9]{2}\.[0-9]{2}\.[0-9]{4})"
    )
    il.add_value("updated", date)
    il.add_css("title", "h1::text")
    # The print version (Druckversion) contains the full article text, but
    # its URL is only exposed in an onclick handler.
    url = response.xpath('//a[text()="Druckversion"]/@onclick').re_first(
        r"window\.open\('(.*)'\);"
    )
    if url:
        return scrapy.Request(
            response.urljoin(url), callback=self._parse_article, meta={"il": il}
        )
    else:
        # No print version means the article is paywalled; extract the teaser.
        il.add_value("category", "paywalled")
        il.add_css("content_html", ".primary")
        il.add_css("content_html", 'div[style="padding-top:10px;"] > h3')
        return il.load_item()
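# For reference, the Druckversion lookup above matches markup of (assumed)
# shape:
#   <a onclick="window.open('/druckversion/some-article');">Druckversion</a>
# so the print-version URL has to be pulled out of the onclick handler with a
# regex rather than read from an href attribute.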