def feed_headers(self):
    """Yield a feed header per ressort and per followed user's postings."""
    for ressort in self._ressorts:
        yield generate_feed_header(
            title=f"derStandard.at › {self._titles.get(ressort, ressort)}",
            subtitle="Nachrichten in Echtzeit",
            link=f"https://www.{self.name}",
            icon="https://at.staticfiles.at/sites/mainweb/img/icons/dst/dst-16.ico",
            logo="https://at.staticfiles.at/sites/mainweb/img/icons/dst/dst-228.png",
            path=ressort,
        )
    for user_id, name in self._users.items():
        yield generate_feed_header(
            title=f"derStandard.at › Postings von {name}",
            subtitle="Nachrichten in Echtzeit",
            # Postings live on the "apps." subdomain, not "www.".
            link=f"https://apps.{self.name}/userprofil/postings/{user_id}",
            icon="https://at.staticfiles.at/sites/mainweb/img/icons/dst/dst-16.ico",
            logo="https://at.staticfiles.at/sites/mainweb/img/icons/dst/dst-228.png",
            path=f"userprofil/postings/{user_id}",
        )
def feed_headers(self):
    """Yield one feed header per ORF channel and per followed author."""
    for channel in self._channels:
        channel_url = f"{channel}.ORF.at"
        yield generate_feed_header(
            title=channel_url,
            link=f"https://{channel_url.lower()}",
            path=channel,
            logo=self._get_logo(channel),
        )
    for author in self._authors:
        yield generate_feed_header(title=f"ORF.at: {author}", path=author)
def feed_headers(self):
    """Emit feed headers for every configured channel and author."""
    for channel in self._channels:
        host = f"{channel}.ORF.at"
        yield generate_feed_header(
            title=host,
            link=f"https://{host.lower()}",
            path=channel,
            logo=self._get_logo(channel),
        )
    for author in self._authors:
        yield generate_feed_header(title=f"ORF.at: {author}", path=author)
def feed_headers(self):
    """Yield one feed header per ORF channel and per followed author."""
    for channel in self._channels:
        channel_url = f"{channel}.ORF.at"
        yield generate_feed_header(
            title=channel_url,
            link=f"https://{channel_url.lower()}",
            path=channel,
            logo=self._get_logo(channel),
        )
    for author in self._authors:
        yield generate_feed_header(title=f"ORF.at: {author}", path=author)
def parse(self, response):
    """Parse a feed document and yield its header plus one item per entry.

    Entries of fulltext feeds are emitted directly; for other feeds the
    linked article is fetched in a follow-up request.
    """
    feed = feedparser.parse(io.BytesIO(response.body))
    if "entries" not in feed or not feed["entries"]:
        # Lazy %-style args: the message is only formatted if it is logged.
        self.logger.error("Feed %s contains no entries!", response.url)
        return
    feed_entries = feed["entries"]
    feed = feed["feed"]
    yield generate_feed_header(
        title=feed.get("title"),
        subtitle=feed.get("subtitle"),
        link=feed.get("link") or response.url,
        path=response.meta["path"],
        author_name=feed.get("author_detail", {}).get("name"),
        logo=feed.get("image", {}).get("href"),
    )
    # "scheme://netloc" of the feed URL, used to resolve relative links.
    base_url = "://".join(urlparse(response.url)[:2])
    for entry in feed_entries:
        # Deals with protocol-relative URLs.
        link = urljoin(base_url, entry["link"])
        il = FeedEntryItemLoader(base_url=base_url)
        il.add_value("path", response.meta["path"])
        il.add_value("updated", entry.get("updated") or entry.get("published"))
        il.add_value("author_name", entry.get("author_detail", {}).get("name"))
        il.add_value("link", link)
        il.add_value("category", [t["term"] for t in entry.get("tags", [])])
        if response.meta["fulltext"]:
            il.add_value("title", entry["title"])
            il.add_value("content_html", entry["content"][0]["value"])
            yield il.load_item()
        else:
            # Content is not part of the feed, scrape it.
            yield scrapy.Request(
                link, self._parse_article, meta={"feed_entry": entry, "il": il}
            )
def feed_headers(self):
    """Yield one feed header per configured category."""
    for category in self._categories:
        yield generate_feed_header(
            title=f"Flimmit.com: {category.title()}",
            link=f"https://www.{self.name}",
            icon=f"https://www.{self.name}/favicon.ico",
            path=category,
        )
def feed_headers(self):
    """Emit a feed header for every category of this spider."""
    site_link = f"https://www.{self.name}"
    for category in self._categories:
        yield generate_feed_header(
            title=f"Flimmit.com: {category.title()}",
            link=site_link,
            icon=f"{site_link}/favicon.ico",
            path=category,
        )
def feed_headers(self):
    """Yield one header per crawled page; only the path differs."""
    common = {
        "title": "falter.at",
        "subtitle": "Wir holen dich da raus.",
        "link": "https://www.falter.at",
    }
    for path in self.pages:
        yield generate_feed_header(path=path, **common)
def feed_headers(self):
    """Yield one feed header per section."""
    for section in self._sections:
        yield generate_feed_header(
            title=f"DiePresse.com/{section}",
            link=f"https://{self.name}",
            path=section,
            # NOTE(review): logo is served over plain http — confirm intended.
            logo="http://diepresse.com/img/diepresse_250x40.png",
        )
def feed_headers(self):
    """Produce a feed header for each page this spider scrapes."""
    for page_path in self.pages:
        header = generate_feed_header(
            title="falter.at",
            subtitle="Wir holen dich da raus.",
            link="https://www.falter.at",
            path=page_path,
        )
        yield header
def feed_headers(self):
    """Yield one feed header per section."""
    for section in self._sections:
        yield generate_feed_header(
            title=f"DiePresse.com/{section}",
            link=f"https://{self.name}",
            path=section,
            # NOTE(review): logo is served over plain http — confirm intended.
            logo="http://diepresse.com/img/diepresse_250x40.png",
        )
def parse(self, response):
    """Parse a Facebook Graph API page response.

    Yields the page's feed header followed by one feed entry per post,
    deriving a title from the post's message, link name, or link.
    """
    page = json.loads(response.text)
    yield generate_feed_header(
        title=page["name"], link=page["link"], path=response.meta["page_id"]
    )
    for entry in page["posts"]["data"]:
        il = FeedEntryItemLoader()
        # updated_time also includes new comments not only updates to the
        # post.
        il.add_value("updated", entry["created_time"])
        # Post ids have the form "<user_id>_<post_id>"; rebuild a permalink.
        il.add_value(
            "link",
            "https://www.{name}/{user_id}/posts/{post_id}".format(
                name=self.name,
                **dict(zip(["user_id", "post_id"], entry["id"].split("_")))
            ),
        )
        message = entry.get("message")
        name = entry.get("name")
        link = entry.get("link")
        if message:
            message = message.splitlines()
            title = message[0]
            if len(title.split()) < 10 and not title.startswith("http"):
                # If the first line has less than ten words, it could be a
                # title.
                if title.upper() == title:
                    # All-caps first line: normalize to title case.
                    title = title.title()
                del message[0]
            elif name and not name.startswith("http"):
                # Fallback to the name (of the link).
                title = name
            else:
                # Fallback to the first ten words of the message.
                title = " ".join(message[0].split(maxsplit=10)) + " ..."
            # Join remaining lines as paragraphs and auto-link URLs.
            message = bleach.linkify("</p><p>".join(message))
            il.add_value("content_html", "<p>{}</p>".format(message))
        elif name:
            title = name
        else:
            # Neither message nor name: fall back to the raw link as title.
            title = link
        il.add_value("title", title)
        if link and name:
            il.add_value(
                "content_html",
                '<p><a href="{link}">{name}</a></p>'.format(link=link, name=name),
            )
        picture = entry.get("picture")
        if picture:
            il.add_value(
                "content_html",
                '<a href="{link}"><img src="{image}"></a>'.format(
                    link=link, image=picture
                ),
            )
        il.add_value("path", response.meta["page_id"])
        yield il.load_item()
def parse(self, response): page = json.loads(response.text) yield generate_feed_header(title=page["name"], link=page["link"], path=response.meta["page_id"]) for entry in page["posts"]["data"]: il = FeedEntryItemLoader() # updated_time also includes new comments not only updates to the # post. il.add_value("updated", entry["created_time"]) il.add_value( "link", "https://www.{name}/{user_id}/posts/{post_id}".format( name=self.name, **dict(zip(["user_id", "post_id"], entry["id"].split("_")))), ) message = entry.get("message") name = entry.get("name") link = entry.get("link") if message: message = message.splitlines() title = message[0] if len(title.split()) < 10 and not title.startswith("http"): # If the first line has less than ten words, it could be a # title. if title.upper() == title: title = title.title() del message[0] elif name and not name.startswith("http"): # Fallback to the name (of the link). title = name else: # Fallback to the first ten words of the message. title = " ".join(message[0].split(maxsplit=10)) + " ..." message = bleach.linkify("</p><p>".join(message)) il.add_value("content_html", "<p>{}</p>".format(message)) elif name: title = name else: title = link il.add_value("title", title) if link and name: il.add_value( "content_html", '<p><a href="{link}">{name}</a></p>'.format(link=link, name=name), ) picture = entry.get("picture") if picture: il.add_value( "content_html", '<a href="{link}"><img src="{image}"></a>'.format( link=link, image=picture), ) il.add_value("path", response.meta["page_id"]) yield il.load_item()
def feed_headers(self):
    """Yield a feed header for each configured site."""
    for site in self._sites:
        header_kwargs = {
            "title": self._titles.get(site),
            "subtitle": self._subtitles.get(site),
            "link": self._links.get(site),
            "icon": self._icons.get(site),
            "path": site,
        }
        yield generate_feed_header(**header_kwargs)
def feed_headers(self):
    """Yield a single feed header built from optional spider attributes.

    Any attribute the subclass does not define defaults to None.
    """
    kwarg_to_attr = {
        "title": "feed_title",
        "subtitle": "feed_subtitle",
        "link": "feed_link",
        "path": "path",
        "author_name": "author_name",
        "icon": "icon",
        "logo": "logo",
    }
    yield generate_feed_header(
        **{kw: getattr(self, attr, None) for kw, attr in kwarg_to_attr.items()}
    )
def feed_headers(self):
    """Yield a feed header for every tracked account."""
    icon_url = f"{self._base_url}/site/favicon.ico"
    logo_url = f"{self._base_url}/site/assets/images/brand-assets/TL_logo.svg"
    for account in self._accounts:
        yield generate_feed_header(
            title=self._titles.get(account),
            subtitle=self._subtitles.get(account),
            link=self._links.get(account),
            icon=icon_url,
            logo=logo_url,
            path=account,
        )
def feed_headers(self):
    """Yield one feed header from the spider's optional feed attributes."""

    def attr_or_none(name):
        # Missing attributes simply mean "not set" for this spider.
        return getattr(self, name, None)

    yield generate_feed_header(
        title=attr_or_none("feed_title"),
        subtitle=attr_or_none("feed_subtitle"),
        link=attr_or_none("feed_link"),
        path=attr_or_none("path"),
        author_name=attr_or_none("author_name"),
        icon=attr_or_none("icon"),
        logo=attr_or_none("logo"),
    )
def parse_blog_overview(self, response):
    """Yield the blog's feed header and request every listed article."""
    yield generate_feed_header(
        title=response.css("article > h1 ::text").extract_first(),
        link="https://www.falter.at",
        path=f"blog_{response.meta['blog']}",
    )
    for link in response.css("div[id^=post-] a::attr(href)").extract():
        yield scrapy.Request(
            link, self.parse_blog_article, meta={"blog": response.meta["blog"]}
        )
def feed_headers(self):
    """Yield one feed header per ressort."""
    for ressort in self._ressorts:
        yield generate_feed_header(
            title=f"Wiener Zeitung › {self._titles.get(ressort, ressort)}",
            link=f"https://www.{self.name}",
            icon=f"https://www.{self.name}/_em_daten/wzo/favicon.ico",
            logo=f"https://www.{self.name}/_em_daten/wzo/_layout/logo_rss.png",
            path=ressort,
        )
def feed_headers(self):
    """Yield one feed header per medium."""
    for medium in self._media:
        yield generate_feed_header(
            title="Kurier.at",
            # Adjacent literals concatenate at compile time — no runtime "+".
            subtitle="Minutenaktuelle Nachrichten aus Österreich und der Welt. "
            "kurier.at - die österreichische Nachrichten-Plattform im Internet. "
            "24 hour news from Austria's biggest quality newspaper.",
            path=medium,
            link=f"https://www.{self.name}",
            logo=f"https://{self.name}/assets/logos/logo.png",
        )
def feed_headers(self):
    """Yield one feed header per ressort."""
    for ressort in self._ressorts:
        yield generate_feed_header(
            title=f"Oberösterreichische Nachrichten {ressort.title()}",
            path=ressort,
            subtitle="OÖN",
            link=f"https://www.{self.name}",
            icon=f"https://static1.{self.name}.at/oonup/images/apple-touch-icon.png",
            logo=f"https://www.{self.name}/pics/webapp/touchicon_180x180.png",
        )
def feed_headers(self):
    """Yield feed headers for the main feed and the podcast feed."""
    feeds = {"": "Addendum", "podcast": "Addendum Podcast"}
    for path, title in feeds.items():
        yield generate_feed_header(
            title=title,
            path=path,
            subtitle="das, was fehlt",
            link=f"https://www.{self.name}",
            icon=f"https://www.{self.name}"
            "/resources/dist/favicons/android-chrome-192x192.png",
        )
def parse(self, response):
    """Yield the ;login: feed header and scrape the most recent issues."""
    # Only scrape the last 8 issues.
    issues = response.css(".issues .month a::attr(href)").extract()[:8]
    yield generate_feed_header(
        # Fixed: the title and subtitle arguments had been garbled into one
        # invalid token (";login:"******"The Usenix Magazine").
        title=";login:",
        subtitle="The Usenix Magazine",
        link=response.url,
        path="login",
    )
    for issue in issues:
        yield scrapy.Request(response.urljoin(issue), self.parse_login_issue)
def feed_headers(self):
    """Yield a feed header for the regular feed and the podcast feed."""
    for feed_path, feed_title in (
        ("", "Addendum"),
        ("podcast", "Addendum Podcast"),
    ):
        yield generate_feed_header(
            title=feed_title,
            path=feed_path,
            subtitle="das, was fehlt",
            link="https://www.{}".format(self.name),
            icon=(
                "https://www.{}/resources/dist/favicons/android-chrome-192x192.png"
            ).format(self.name),
        )
def feed_headers(self):
    """Yield one feed header per ressort."""
    for ressort in self._ressorts:
        yield generate_feed_header(
            title=f"Oberösterreichische Nachrichten {ressort.title()}",
            path=ressort,
            subtitle="OÖN",
            link=f"https://www.{self.name}",
            # Finished the f-string conversion that was only half applied.
            icon=f"https://static1.{self.name}.at/oonup/images/apple-touch-icon.png",
            logo=f"https://www.{self.name}/pics/webapp/touchicon_180x180.png",
        )
def feed_headers(self):
    """Yield one feed header per ressort."""
    for ressort in self._ressorts:
        yield generate_feed_header(
            title=f"The Economist › {self._titles.get(ressort, ressort)}",
            link=f"https://www.{self.name}",
            icon=f"https://www.{self.name}/engassets/ico/favicon.f1ea9088.ico",
            logo=f"https://www.{self.name}"
            "/engassets/ico/touch-icon-180x180.f1ea9088.png",
            path=ressort,
        )
def feed_headers(self):
    """Yield one feed header per channel."""
    for channel in self._channels:
        yield generate_feed_header(
            title=f"Ars Technica: {channel.title()}",
            link=f"https://{self.name}",
            path=channel,
            # Adjacent literals join at compile time; the runtime "+" was
            # unnecessary.
            icon="https://cdn.arstechnica.net/wp-content/uploads/2016/10/"
            "cropped-ars-logo-512_480-32x32.png",
            logo="https://cdn.arstechnica.net/wp-content/themes/ars-mobile/assets/"
            "images/material-ars.png",
        )
def feed_headers(self):
    """Yield a feed header per ressort and per followed user's postings."""
    for ressort in self._ressorts:
        yield generate_feed_header(
            title=self._titles[ressort],
            subtitle="Nachrichten in Echtzeit",
            link=f"https://{self.name}",
            icon="https://at.staticfiles.at/sites/mainweb/img/icons/dst/dst-16.ico",
            logo="https://at.staticfiles.at/sites/mainweb/img/icons/dst/dst-228.png",
            path=ressort,
        )
    for user_id, name in self._users.items():
        yield generate_feed_header(
            title=f"derStandard.at › Postings von {name}",
            subtitle="Nachrichten in Echtzeit",
            link=f"https://{self.name}/userprofil/postings/{user_id}",
            icon="https://at.staticfiles.at/sites/mainweb/img/icons/dst/dst-16.ico",
            logo="https://at.staticfiles.at/sites/mainweb/img/icons/dst/dst-228.png",
            path=f"userprofil/postings/{user_id}",
        )
def feed_headers(self):
    """Yield one feed header per locale; nothing if no locales are set."""
    if not self._locales:
        return []
    for locale in self._locales:
        yield generate_feed_header(
            title=f"VICE {locale.title()}",
            path=locale,
            link=f"https://www.{self.name}",
            # Icon and logo intentionally share the same asset.
            logo=f"https://www.{self.name}/favicons/apple-touch-icon-60x60.png",
            icon=f"https://www.{self.name}/favicons/apple-touch-icon-60x60.png",
        )
def feed_headers(self):
    """Yield one feed header per locale; nothing if no locales are set."""
    if not self._locales:
        return []
    for locale in self._locales:
        yield generate_feed_header(
            title=f"VICE {locale.title()}",
            path=locale,
            link=f"https://www.{self.name}",
            # Icon and logo intentionally share the same asset.
            logo=f"https://www.{self.name}/favicons/apple-touch-icon-60x60.png",
            icon=f"https://www.{self.name}/favicons/apple-touch-icon-60x60.png",
        )
def feed_headers(self):
    """Yield one feed header per channel."""
    for channel in self._channels:
        yield generate_feed_header(
            title=f"Ars Technica: {channel.title()}",
            link=f"https://{self.name}",
            path=channel,
            # Adjacent literals join at compile time; no runtime "+" needed.
            icon="https://cdn.arstechnica.net/wp-content/uploads/2016/10/"
            "cropped-ars-logo-512_480-32x32.png",
            logo="https://cdn.arstechnica.net/wp-content/themes/ars-mobile/assets/"
            "images/material-ars.png",
        )
def parse(self, response):
    """Scrape product thumbnails, emit the feed header on page 1, paginate."""
    # Query the thumbnail list once instead of twice.
    items = response.css(".thumbnail")
    if not items:
        self.logger.info("No items found.")
        return
    for item in items:
        il = FeedEntryItemLoader(selector=item, base_url=self._base_url)
        il.add_css("title", ".item_brand_text ::text")
        il.add_css("title", ".item-title ::text")
        il.add_css("title", ".current-price ::text")
        il.add_value(
            "link",
            response.urljoin(item.css(".item-link::attr(href)").extract_first()),
        )
        # Extract the background-image URL from the inline style attribute.
        # NOTE(review): re_first returns None if the pattern is absent —
        # startswith would then raise; confirm the attribute always matches.
        image_url = item.css(".item-image::attr(data-bg)").re_first(
            r"url\(([^)]+)\)"
        )
        # Fix broken images.
        if image_url.startswith("https://markenankauf.momox.de/pics/https://"):
            image_url = image_url.replace(
                "https://markenankauf.momox.de/pics/https://", "https://"
            )
        il.add_value("content_html", f'<img src="{image_url}">')
        il.add_css("content_html", ".item-des-container")
        il.add_value("path", response.meta["path"])
        yield il.load_item()
    page = int(response.css(".pagination .active a::text").extract_first())
    if page == 1:
        # The header is only emitted once, on the first result page.
        yield generate_feed_header(
            title=response.css("title ::text").re_first(
                "(ubup | .*) Second Hand kaufen"
            ),
            subtitle="Deutschlands größter Second Hand-Onlineshop für "
            "Mode & Accessoires",
            icon=f"https://www.{self.name}/images/favicon.ico",
            link=response.url,
            path=response.meta["path"],
        )
    if page < self._scrape_pages:
        next_page = response.css(
            ".pagination .active + li a::attr(href)"
        ).extract_first()
        if next_page:
            yield scrapy.Request(
                response.urljoin(next_page),
                meta={"dont_cache": True, "path": response.meta["path"]},
            )
def parse(self, response):
    """Scrape product thumbnails into feed entries and follow pagination.

    The feed header is emitted only once, on the first result page.
    """
    if len(response.css(".thumbnail")) == 0:
        self.logger.info("No items found.")
        return
    for item in response.css(".thumbnail"):
        il = FeedEntryItemLoader(selector=item, base_url=self._base_url)
        il.add_css("title", ".item_brand_text ::text")
        il.add_css("title", ".item-title ::text")
        il.add_css("title", ".current-price ::text")
        il.add_value(
            "link",
            response.urljoin(item.css(".item-link::attr(href)").extract_first()),
        )
        # Extract the background-image URL from the inline style attribute.
        image_url = item.css(".item-image::attr(data-bg)").re_first(
            r"url\(([^)]+)\)"
        )
        # Fix broken images.
        if image_url.startswith("https://markenankauf.momox.de/pics/https://"):
            image_url = image_url.replace(
                "https://markenankauf.momox.de/pics/https://", "https://"
            )
        il.add_value("content_html", f'<img src="{image_url}">')
        il.add_css("content_html", ".item-des-container")
        il.add_value("path", response.meta["path"])
        yield il.load_item()
    # Current page number from the active pagination element.
    page = int(response.css(".pagination .active a::text").extract_first())
    if page == 1:
        yield generate_feed_header(
            title=response.css("title ::text").re_first(
                "(ubup | .*) Second Hand kaufen"
            ),
            subtitle="Deutschlands größter Second Hand-Onlineshop für "
            "Mode & Accessoires",
            icon=f"https://www.{self.name}/images/favicon.ico",
            link=response.url,
            path=response.meta["path"],
        )
    # Follow pagination up to the configured page limit.
    if page < self._scrape_pages:
        next_page = response.css(
            ".pagination .active + li a::attr(href)"
        ).extract_first()
        if next_page:
            yield scrapy.Request(
                response.urljoin(next_page),
                meta={"dont_cache": True, "path": response.meta["path"]},
            )
def feed_headers(self):
    """Yield one feed header per ressort."""
    for ressort in self._ressorts:
        yield generate_feed_header(
            title=f"Financial Times › {self._titles.get(ressort, ressort)}",
            link=f"https://www.{self.name}",
            icon=(
                f"https://www.{self.name}/__origami/service/image/v2/images/raw/"
                "ftlogo-v1%3Abrand-ft-logo-square-coloured?source=update-logos"
                "&width=32&height=32&format=png"
            ),
            logo=(
                f"https://www.{self.name}/__origami/service/image/v2/images/raw/"
                "ftlogo-v1%3Abrand-ft-logo-square-coloured?source=update-logos"
                "&width=194&height=194&format=png"
            ),
            path=ressort,
        )
def _parse_show(self, response):
    """Yield the show's feed header followed by one item per episode."""
    show = json.loads(response.text)
    path = response.meta["show"]
    yield generate_feed_header(
        title=show["name"],
        link=show["external_urls"]["spotify"],
        icon=show["images"][-1]["url"],
        logo=show["images"][0]["url"],
        path=path,
    )
    for episode in show["episodes"]["items"]:
        loader = FeedEntryItemLoader()
        loader.add_value("link", episode["external_urls"]["spotify"])
        loader.add_value("updated", episode["release_date"])
        loader.add_value("title", episode["name"])
        loader.add_value("content_html", episode["description"])
        loader.add_value("path", path)
        yield loader.load_item()
def parse(self, response):
    """Request every linked article, then yield the newsletter's feed header."""
    for url in response.css('.item .title a::attr("href")').extract():
        yield scrapy.Request(
            url,
            self._parse_article,
            headers={"Cookie": "trackingChoice=true; choiceVersion=1"},
            meta={"path": response.meta["path"]},
        )
    brand = response.css(".branding__image-icon::attr('alt')").extract_first()
    yield generate_feed_header(
        title=f"{brand} Newsletter",
        subtitle=response.css(".branding__mini-teaser ::text").extract_first(),
        link=response.url,
        logo=response.css(".branding__image-icon::attr('src')").extract_first(),
        path=response.meta["path"],
    )
def parse(self, response):
    """Parse a feed document and yield its header plus one item per entry.

    Fulltext feeds are emitted directly; otherwise the linked article is
    fetched and parsed in a follow-up request.
    """
    feed = feedparser.parse(io.BytesIO(response.body))
    if "entries" not in feed or not feed["entries"]:
        self.logger.error("Feed {} contains no entries!".format(
            response.url))
        return
    feed_entries = feed["entries"]
    feed = feed["feed"]
    yield generate_feed_header(
        title=feed.get("title"),
        subtitle=feed.get("subtitle"),
        link=feed.get("link") or response.url,
        path=response.meta["path"],
        author_name=feed.get("author_detail", {}).get("name"),
        logo=feed.get("image", {}).get("href"),
    )
    # "scheme://netloc" of the feed URL, used to resolve relative links.
    base_url = "://".join(urlparse(response.url)[:2])
    for entry in feed_entries:
        # Deals with protocol-relative URLs.
        link = urljoin(base_url, entry["link"])
        il = FeedEntryItemLoader(base_url=base_url)
        il.add_value("path", response.meta["path"])
        il.add_value("updated", entry.get("updated") or entry.get("published"))
        il.add_value("author_name", entry.get("author_detail", {}).get("name"))
        il.add_value("link", link)
        il.add_value("category", [t["term"] for t in entry.get("tags", [])])
        if response.meta["fulltext"]:
            il.add_value("title", entry["title"])
            il.add_value("content_html", entry["content"][0]["value"])
            yield il.load_item()
        else:
            # Content is not part of the feed, scrape it.
            yield scrapy.Request(link, self._parse_article, meta={
                "feed_entry": entry,
                "il": il
            })