Example #1
    def _parse_article(self, response):
        title = response.css('meta[property="og:title"]::attr(content)').extract_first()
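        # og:title is missing when the site serves a bot-detection page
        # instead of the article, so the response is dropped as transient.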
        if not title:
            raise DropResponse(
                "Skipping {} because we ran into bot detection".format(response.url),
                transient=True,
            )

        remove_elems = [
            "meta",
            ".ds-share-list",
            ".advert",
            ".layout-article-links",
            ".ds-chapter-list",
            ".layout-article-meta",
        ]
        change_tags = {
            ".article__lead-image": "figure",
            ".article__description": "h2",
            ".article__footnote": "i",
        }
        il = FeedEntryItemLoader(
            response=response,
            base_url="https://{}".format(self.name),
            remove_elems=remove_elems,
            change_tags=change_tags,
        )
        il.add_value("link", response.url)
        il.add_value("title", title)
        il.add_css("updated", "time.article__dateline-datetime::attr('datetime')")
        il.add_css("content_html", ".article__lead-image")
        il.add_css("content_html", ".article__description")
        il.add_css("content_html", ".layout-article-body")
        il.add_value("path", response.meta["ressort"])
        return il.load_item()
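
All four examples raise DropResponse to skip a single response without aborting the whole crawl, and they only rely on a message plus a transient flag. Below is a minimal sketch of such an exception, assuming the real class (which ships with the surrounding feeds code base) behaves this way; the default value of transient is a guess:

    class DropResponse(Exception):
        """Drop a response instead of parsing it into a feed item.

        transient=True marks drops that are expected to resolve on a later
        crawl (bot detection, media that is not published yet), so a handler
        can log them less severely than permanent failures.
        """

        def __init__(self, message, transient=False):
            super().__init__(message)
            self.transient = transient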
Example #2
    def _parse_stream(self, response):
        il = response.meta["il"]

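        # Anything but HTTP 200 means the stream is not downloadable yet.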
        if response.status != 200:
            url = il.get_output_value("link")
            raise DropResponse(f"Skipping {url} because not downloadable yet",
                               transient=True)

        yield il.load_item()
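
_parse_stream expects a pre-filled FeedEntryItemLoader in response.meta["il"], the same hand-off that example #4 uses before calling back into another parse method. A sketch of a matching caller in the same spider context (scrapy and FeedEntryItemLoader already imported); the overview selector and link class are placeholders, not taken from a real spider:

    def _parse_overview(self, response):
        il = FeedEntryItemLoader(response=response)
        il.add_value("link", response.url)
        il.add_css("title", "h1::text")
        # Hypothetical selector: follow the stream link and let
        # _parse_stream decide whether the item can be emitted yet.
        stream_url = response.css("a.stream::attr(href)").extract_first()
        yield scrapy.Request(
            response.urljoin(stream_url),
            callback=self._parse_stream,
            meta={"il": il},
        )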
Example #3
    def _parse_episode(self, response):
        item = json.loads(response.text)
        il = FeedEntryItemLoader()
        il.add_value("title", item["title"])
        il.add_value(
            "content_html",
            '<img src="{}">'.format(item["playlist"]["preview_image_url"]),
        )
        if item["description"]:
            il.add_value("content_html",
                         item["description"].replace("\r\n", "<br>"))
        il.add_value("updated", item["date"])
        il.add_value("link", item["url"].replace("api-tvthek.orf.at",
                                                 "tvthek.orf.at"))
        # Check how many segments are part of this episode.
        if len(item["_embedded"]["segments"]) == 1:
            # If only one segment, item["sources"] contains invalid links.
            # We use the first embedded segment instead.
            # This is also how mediathekviewweb.de works.
            item["sources"] = item["_embedded"]["segments"][0]["sources"]
        try:
            video = next(s for s in item["sources"]["progressive_download"]
                         if s["quality_key"] == "Q8C")
            il.add_value("enclosure", {
                "iri": video["src"],
                "type": "video/mp4"
            })
        except StopIteration:
            self.logger.warning("Could not extract video for '{}'!".format(
                item["title"]))
            raise DropResponse(
                f"Skipping {response.url} because not downloadable yet",
                transient=True,
            )

        subtitle = item["_embedded"].get("subtitle")
        if subtitle:
            subtitle_url = subtitle["_embedded"]["srt_file"]["public_urls"][
                "reference"]["url"]
            il.add_value("enclosure", {
                "iri": subtitle_url,
                "type": "text/plain"
            })
        else:
            self.logger.debug("No subtitle file found for '{}'".format(
                item["url"]))
        il.add_value(
            "category",
            self._categories_from_oewa_base_path(
                item["_embedded"]["profile"]["oewa_base_path"]),
        )
        return il.load_item()
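
Example #3 drops the whole episode when no "Q8C" progressive-download source is present. A possible variant falls back to the best quality that is available instead; note that only "Q8C" is confirmed by the example, the other quality keys are assumptions about the API:

    # Assumed quality ranking; only "Q8C" appears in the original code.
    QUALITY_ORDER = ("Q8C", "Q6A", "Q4A", "Q1A")

    def _best_source(self, sources):
        """Return the highest-quality progressive download, or None."""
        by_quality = {s["quality_key"]: s for s in sources}
        for quality in self.QUALITY_ORDER:
            if quality in by_quality:
                return by_quality[quality]
        return None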
Example #4
    def _parse_article_url(self, response):
        if not response.css("#content"):
            raise DropResponse(
                f"Skipping {response.url} since it is empty", transient=True
            )

        if "Fehler" in response.css("h2 ::text").extract_first():
            raise DropResponse(
                f"Skipping {response.url} since it returned an error",
                transient=True,
            )

        remove_elems = ['div[style="padding-top:10px;"]']
        il = FeedEntryItemLoader(
            response=response,
            timezone="Europe/Vienna",
            base_url=f"https://{self.name}",
            dayfirst=True,
            remove_elems=remove_elems,
        )
        il.add_value("link", response.url)
        il.add_value("author_name", "VKI")
        date = response.css(".issue").re_first(
            r"veröffentlicht:\s*([0-9]{2}\.[0-9]{2}\.[0-9]{4})"
        )
        il.add_value("updated", date)
        url = response.xpath('//a[text()="Druckversion"]/@onclick').re_first(
            r"window\.open\('(.*)'\);"
        )
        il.add_css("title", "h1::text")
        if url:
            return scrapy.Request(
                response.urljoin(url), callback=self._parse_article, meta={"il": il}
            )
        else:
            il.add_value("category", "paywalled")
            il.add_css("content_html", ".primary")
            il.add_css("content_html", 'div[style="padding-top:10px;"] > h3')
            return il.load_item()
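
The print-version URL in example #4 is pulled out of that onclick handler with re_first. A standalone check of the same pattern, with a made-up onclick value shaped like the markup the spider expects:

    import re

    onclick = "window.open('/druckversion?id=12345');"
    url = re.search(r"window\.open\('(.*)'\);", onclick).group(1)
    assert url == "/druckversion?id=12345"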