示例#1
0
    def feed_headers(self):
        """Yield a feed header for every ressort and every followed user."""
        icon = "https://at.staticfiles.at/sites/mainweb/img/icons/dst/dst-16.ico"
        logo = "https://at.staticfiles.at/sites/mainweb/img/icons/dst/dst-228.png"

        for section in self._ressorts:
            # Fall back to the raw ressort key when no pretty title is known.
            yield generate_feed_header(
                title=f"derStandard.at › {self._titles.get(section, section)}",
                subtitle="Nachrichten in Echtzeit",
                link=f"https://www.{self.name}",
                icon=icon,
                logo=logo,
                path=section,
            )

        for uid, display_name in self._users.items():
            yield generate_feed_header(
                title=f"derStandard.at › Postings von {display_name}",
                subtitle="Nachrichten in Echtzeit",
                link=f"https://apps.{self.name}/userprofil/postings/{uid}",
                icon=icon,
                logo=logo,
                path=f"userprofil/postings/{uid}",
            )
示例#2
0
文件: orf_at.py 项目: Lukas0907/feeds
    def feed_headers(self):
        """Yield feed headers for all configured ORF channels and authors."""
        for chan in self._channels:
            pretty = "{}.ORF.at".format(chan)
            header = generate_feed_header(
                title=pretty,
                link="https://" + pretty.lower(),
                path=chan,
                logo=self._get_logo(chan),
            )
            yield header

        for writer in self._authors:
            yield generate_feed_header(title="ORF.at: " + writer, path=writer)
示例#3
0
    def feed_headers(self):
        """Yield a header per ORF channel, then one per tracked author."""
        for ch in self._channels:
            site = "{}.ORF.at".format(ch)
            yield generate_feed_header(
                title=site,
                link="https://{}".format(site.lower()),
                path=ch,
                logo=self._get_logo(ch),
            )

        for person in self._authors:
            yield generate_feed_header(title="ORF.at: {}".format(person), path=person)
示例#4
0
    def feed_headers(self):
        """Yield generated feed headers for each channel and each author."""
        for channel in self._channels:
            label = f"{channel}.ORF.at"
            yield generate_feed_header(
                title=label,
                link=f"https://{label.lower()}",
                path=channel,
                logo=self._get_logo(channel),
            )

        for author in self._authors:
            yield generate_feed_header(title=f"ORF.at: {author}", path=author)
示例#5
0
 def parse(self, response):
     """Parse a fetched RSS/Atom feed and emit a header plus its entries.

     Entries are emitted directly when ``response.meta["fulltext"]`` is
     set; otherwise each entry's link is scraped via ``_parse_article``.
     """
     feed = feedparser.parse(io.BytesIO(response.body))
     if "entries" not in feed or not feed["entries"]:
         self.logger.error("Feed {} contains no entries!".format(response.url))
         return
     feed_entries = feed["entries"]
     feed = feed["feed"]
     # Header for the feed itself; fall back to the request URL as link.
     yield generate_feed_header(
         title=feed.get("title"),
         subtitle=feed.get("subtitle"),
         link=feed.get("link") or response.url,
         path=response.meta["path"],
         author_name=feed.get("author_detail", {}).get("name"),
         logo=feed.get("image", {}).get("href"),
     )
     # scheme://netloc of the feed URL, used to resolve relative entry links.
     base_url = "://".join(urlparse(response.url)[:2])
     for entry in feed_entries:
         # Deals with protocol-relative URLs.
         link = urljoin(base_url, entry["link"])
         il = FeedEntryItemLoader(base_url=base_url)
         il.add_value("path", response.meta["path"])
         il.add_value("updated", entry.get("updated") or entry.get("published"))
         il.add_value("author_name", entry.get("author_detail", {}).get("name"))
         il.add_value("link", link)
         il.add_value("category", [t["term"] for t in entry.get("tags", [])])
         if response.meta["fulltext"]:
             il.add_value("title", entry["title"])
             il.add_value("content_html", entry["content"][0]["value"])
             yield il.load_item()
         else:
             # Content is not part of the feed, scrape it.
             yield scrapy.Request(
                 link, self._parse_article, meta={"feed_entry": entry, "il": il}
             )
示例#6
0
 def feed_headers(self):
     """Yield a feed header for every configured Flimmit category."""
     for cat in self._categories:
         yield generate_feed_header(
             title=f"Flimmit.com: {cat.title()}",
             link=f"https://www.{self.name}",
             icon=f"https://www.{self.name}/favicon.ico",
             path=cat,
         )
示例#7
0
 def feed_headers(self):
     """Yield one feed header per category."""
     for category in self._categories:
         yield generate_feed_header(
             title="Flimmit.com: {}".format(category.title()),
             link="https://www.{}".format(self.name),
             icon="https://www.{}/favicon.ico".format(self.name),
             path=category,
         )
示例#8
0
 def feed_headers(self):
     """Yield a header for each scraped falter.at page."""
     common = {
         "title": "falter.at",
         "subtitle": "Wir holen dich da raus.",
         "link": "https://www.falter.at",
     }
     for page_path in self.pages:
         yield generate_feed_header(path=page_path, **common)
示例#9
0
 def feed_headers(self):
     """Yield a feed header for every configured section."""
     for sec in self._sections:
         yield generate_feed_header(
             title=f"DiePresse.com/{sec}",
             link=f"https://{self.name}",
             path=sec,
             logo="http://diepresse.com/img/diepresse_250x40.png",
         )
示例#10
0
 def feed_headers(self):
     """Emit one feed header per configured page path."""
     for p in self.pages:
         header = generate_feed_header(
             title="falter.at",
             subtitle="Wir holen dich da raus.",
             link="https://www.falter.at",
             path=p,
         )
         yield header
示例#11
0
 def feed_headers(self):
     """Generate one feed header per section."""
     logo_url = "http://diepresse.com/img/diepresse_250x40.png"
     site_link = "https://{}".format(self.name)
     for section in self._sections:
         yield generate_feed_header(
             title="DiePresse.com/{}".format(section),
             link=site_link,
             path=section,
             logo=logo_url,
         )
示例#12
0
 def parse(self, response):
     """Parse a Facebook page's JSON into a feed header plus one item per post.

     A post's title comes from its first message line when that looks like a
     headline (short, not a URL); otherwise from the link name; otherwise
     from the first ten words of the message.
     """
     page = json.loads(response.text)
     yield generate_feed_header(
         title=page["name"], link=page["link"], path=response.meta["page_id"]
     )
     for entry in page["posts"]["data"]:
         il = FeedEntryItemLoader()
         # updated_time also includes new comments not only updates to the
         # post.
         il.add_value("updated", entry["created_time"])
         # Post ids have the form "<user_id>_<post_id>".
         il.add_value(
             "link",
             "https://www.{name}/{user_id}/posts/{post_id}".format(
                 name=self.name,
                 **dict(zip(["user_id", "post_id"], entry["id"].split("_")))
             ),
         )
         message = entry.get("message")
         name = entry.get("name")
         link = entry.get("link")
         if message:
             message = message.splitlines()
             title = message[0]
             if len(title.split()) < 10 and not title.startswith("http"):
                 # If the first line has less than ten words, it could be a
                 # title.
                 if title.upper() == title:
                     title = title.title()
                 del message[0]
             elif name and not name.startswith("http"):
                 # Fallback to the name (of the link).
                 title = name
             else:
                 # Fallback to the first ten words of the message.
                 # (split(maxsplit=10) kept the rest of the line in the last
                 # element, so the ten-word cap never applied; slice instead.)
                 title = " ".join(message[0].split()[:10]) + " ..."
             message = bleach.linkify("</p><p>".join(message))
             il.add_value("content_html", "<p>{}</p>".format(message))
         elif name:
             title = name
         else:
             title = link
         il.add_value("title", title)
         if link and name:
             il.add_value(
                 "content_html",
                 '<p><a href="{link}">{name}</a></p>'.format(link=link, name=name),
             )
         picture = entry.get("picture")
         if picture:
             il.add_value(
                 "content_html",
                 '<a href="{link}"><img src="{image}"></a>'.format(
                     link=link, image=picture
                 ),
             )
         il.add_value("path", response.meta["page_id"])
         yield il.load_item()
示例#13
0
 def parse(self, response):
     """Parse a Facebook page JSON response.

     Emits a feed header for the page followed by one feed item per post.
     The title is the first message line when it looks like a headline,
     else the link name, else the first ten words of the message.
     """
     page = json.loads(response.text)
     yield generate_feed_header(title=page["name"],
                                link=page["link"],
                                path=response.meta["page_id"])
     for entry in page["posts"]["data"]:
         il = FeedEntryItemLoader()
         # updated_time also includes new comments not only updates to the
         # post.
         il.add_value("updated", entry["created_time"])
         # Post ids have the form "<user_id>_<post_id>".
         il.add_value(
             "link",
             "https://www.{name}/{user_id}/posts/{post_id}".format(
                 name=self.name,
                 **dict(zip(["user_id", "post_id"],
                            entry["id"].split("_")))),
         )
         message = entry.get("message")
         name = entry.get("name")
         link = entry.get("link")
         if message:
             message = message.splitlines()
             title = message[0]
             if len(title.split()) < 10 and not title.startswith("http"):
                 # If the first line has less than ten words, it could be a
                 # title.
                 if title.upper() == title:
                     title = title.title()
                 del message[0]
             elif name and not name.startswith("http"):
                 # Fallback to the name (of the link).
                 title = name
             else:
                 # Fallback to the first ten words of the message.
                 # (split(maxsplit=10) kept the remainder of the line in its
                 # last element, defeating the ten-word cap; slice instead.)
                 title = " ".join(message[0].split()[:10]) + " ..."
             message = bleach.linkify("</p><p>".join(message))
             il.add_value("content_html", "<p>{}</p>".format(message))
         elif name:
             title = name
         else:
             title = link
         il.add_value("title", title)
         if link and name:
             il.add_value(
                 "content_html",
                 '<p><a href="{link}">{name}</a></p>'.format(link=link,
                                                             name=name),
             )
         picture = entry.get("picture")
         if picture:
             il.add_value(
                 "content_html",
                 '<a href="{link}"><img src="{image}"></a>'.format(
                     link=link, image=picture),
             )
         il.add_value("path", response.meta["page_id"])
         yield il.load_item()
示例#14
0
 def feed_headers(self):
     """Yield one feed header per site, looking up per-site metadata."""
     lookups = (
         ("title", self._titles),
         ("subtitle", self._subtitles),
         ("link", self._links),
         ("icon", self._icons),
     )
     for site in self._sites:
         kwargs = {key: table.get(site) for key, table in lookups}
         yield generate_feed_header(path=site, **kwargs)
示例#15
0
 def feed_headers(self):
     """Yield a single feed header built from optional spider attributes."""
     fields = {
         "title": "feed_title",
         "subtitle": "feed_subtitle",
         "link": "feed_link",
         "path": "path",
         "author_name": "author_name",
         "icon": "icon",
         "logo": "logo",
     }
     yield generate_feed_header(
         **{arg: getattr(self, attr, None) for arg, attr in fields.items()}
     )
示例#16
0
 def feed_headers(self):
     """Yield a feed header for every tracked account."""
     favicon = f"{self._base_url}/site/favicon.ico"
     brand_logo = f"{self._base_url}/site/assets/images/brand-assets/TL_logo.svg"
     for acct in self._accounts:
         yield generate_feed_header(
             title=self._titles.get(acct),
             subtitle=self._subtitles.get(acct),
             link=self._links.get(acct),
             icon=favicon,
             logo=brand_logo,
             path=acct,
         )
示例#17
0
 def feed_headers(self):
     """Yield one feed header; every field is optional on the spider."""
     def _opt(attr):
         return getattr(self, attr, None)

     yield generate_feed_header(
         title=_opt("feed_title"),
         subtitle=_opt("feed_subtitle"),
         link=_opt("feed_link"),
         path=_opt("path"),
         author_name=_opt("author_name"),
         icon=_opt("icon"),
         logo=_opt("logo"),
     )
示例#18
0
    def parse_blog_overview(self, response):
        """Emit a header for the blog overview page, then request each post."""
        blog = response.meta["blog"]
        yield generate_feed_header(
            title=response.css("article > h1 ::text").extract_first(),
            link="https://www.falter.at",
            path="blog_{}".format(blog),
        )

        # Follow every post linked from the overview page.
        for post_url in response.css("div[id^=post-] a::attr(href)").extract():
            yield scrapy.Request(
                post_url, self.parse_blog_article, meta={"blog": blog}
            )
示例#19
0
 def feed_headers(self):
     """Yield a feed header per ressort with site icon and RSS logo."""
     for r in self._ressorts:
         # Fall back to the raw ressort key if no display title is known.
         yield generate_feed_header(
             title="Wiener Zeitung › {}".format(self._titles.get(r, r)),
             link=f"https://www.{self.name}",
             icon=f"https://www.{self.name}/_em_daten/wzo/favicon.ico",
             logo=f"https://www.{self.name}/_em_daten/wzo/_layout/logo_rss.png",
             path=r,
         )
示例#20
0
 def feed_headers(self):
     """Yield a header for each medium served by the Kurier spider."""
     subtitle = (
         "Minutenaktuelle Nachrichten aus Österreich und der Welt. "
         "kurier.at - die österreichische Nachrichten-Plattform im Internet. "
         "24 hour news from Austria's biggest quality newspaper."
     )
     for medium in self._media:
         yield generate_feed_header(
             title="Kurier.at",
             subtitle=subtitle,
             path=medium,
             link=f"https://www.{self.name}",
             logo=f"https://{self.name}/assets/logos/logo.png",
         )
示例#21
0
 def feed_headers(self):
     """Yield one feed header per ressort."""
     for ressort in self._ressorts:
         yield generate_feed_header(
             title=f"Oberösterreichische Nachrichten {ressort.title()}",
             path=ressort,
             subtitle="OÖN",
             link=f"https://www.{self.name}",
             icon=f"https://static1.{self.name}.at/oonup/images/apple-touch-icon.png",
             logo=f"https://www.{self.name}/pics/webapp/touchicon_180x180.png",
         )
示例#22
0
 def feed_headers(self):
     """Yield headers for the main Addendum feed and the podcast feed."""
     feeds = {"": "Addendum", "podcast": "Addendum Podcast"}
     for path, title in feeds.items():
         yield generate_feed_header(
             title=title,
             path=path,
             subtitle="das, was fehlt",
             link=f"https://www.{self.name}",
             icon=(
                 f"https://www.{self.name}"
                 "/resources/dist/favicons/android-chrome-192x192.png"
             ),
         )
示例#23
0
 def parse(self, response):
     """Emit the ;login: feed header and crawl the most recent issues."""
     # Only scrape the last 8 issues.
     issues = response.css(".issues .month a::attr(href)").extract()[:8]
     # NOTE(review): the original line was corrupted by a scrape/redaction
     # artifact (";login:"******"The Usenix Magazine"); restored as separate
     # title/subtitle keywords matching the pattern used elsewhere.
     yield generate_feed_header(
         title=";login:",
         subtitle="The Usenix Magazine",
         link=response.url,
         path="login",
     )
     for issue in issues:
         yield scrapy.Request(response.urljoin(issue), self.parse_login_issue)
示例#24
0
 def feed_headers(self):
     """Yield the two Addendum feed headers (main site and podcast)."""
     for feed_path, feed_title in (("", "Addendum"), ("podcast", "Addendum Podcast")):
         yield generate_feed_header(
             title=feed_title,
             path=feed_path,
             subtitle="das, was fehlt",
             link="https://www.{}".format(self.name),
             icon="https://www.{}/resources/dist/favicons/"
             "android-chrome-192x192.png".format(self.name),
         )
示例#25
0
 def feed_headers(self):
     """Yield feed headers for all OÖN ressorts."""
     icon_url = "https://static1.{}.at/oonup/images/apple-touch-icon.png".format(
         self.name
     )
     logo_url = "https://www.{}/pics/webapp/touchicon_180x180.png".format(self.name)
     for r in self._ressorts:
         yield generate_feed_header(
             title="Oberösterreichische Nachrichten {}".format(r.title()),
             path=r,
             subtitle="OÖN",
             link="https://www.{}".format(self.name),
             icon=icon_url,
             logo=logo_url,
         )
示例#26
0
 def feed_headers(self):
     """Yield a feed header for every Economist ressort."""
     for ressort in self._ressorts:
         yield generate_feed_header(
             title=f"The Economist › {self._titles.get(ressort, ressort)}",
             link=f"https://www.{self.name}",
             icon=f"https://www.{self.name}/engassets/ico/favicon.f1ea9088.ico",
             logo=(
                 f"https://www.{self.name}"
                 "/engassets/ico/touch-icon-180x180.f1ea9088.png"
             ),
             path=ressort,
         )
示例#27
0
 def feed_headers(self):
     """Yield one feed header per Ars Technica channel."""
     icon_url = (
         "https://cdn.arstechnica.net/wp-content/uploads/2016/10/"
         "cropped-ars-logo-512_480-32x32.png"
     )
     logo_url = (
         "https://cdn.arstechnica.net/wp-content/themes/ars-mobile/assets/"
         "images/material-ars.png"
     )
     for channel in self._channels:
         yield generate_feed_header(
             title=f"Ars Technica: {channel.title()}",
             link=f"https://{self.name}",
             path=channel,
             icon=icon_url,
             logo=logo_url,
         )
示例#28
0
    def feed_headers(self):
        """Yield headers for every ressort feed and every user-posting feed."""
        icon = "https://at.staticfiles.at/sites/mainweb/img/icons/dst/dst-16.ico"
        logo = "https://at.staticfiles.at/sites/mainweb/img/icons/dst/dst-228.png"

        for ressort in self._ressorts:
            yield generate_feed_header(
                title=self._titles[ressort],
                subtitle="Nachrichten in Echtzeit",
                link="https://{}".format(self.name),
                icon=icon,
                logo=logo,
                path=ressort,
            )

        for uid, username in self._users.items():
            yield generate_feed_header(
                title="derStandard.at › Postings von {}".format(username),
                subtitle="Nachrichten in Echtzeit",
                link="https://{}/userprofil/postings/{}".format(self.name, uid),
                icon=icon,
                logo=logo,
                path="userprofil/postings/{}".format(uid),
            )
示例#29
0
    def feed_headers(self):
        """Yield a feed header per locale; nothing if no locales configured."""
        if not self._locales:
            return []

        # The same touch icon serves as both icon and logo.
        touch_icon = "https://www.{}/favicons/apple-touch-icon-60x60.png".format(
            self.name
        )
        for locale in self._locales:
            yield generate_feed_header(
                title="VICE {}".format(locale.title()),
                path=locale,
                link="https://www.{}".format(self.name),
                logo=touch_icon,
                icon=touch_icon,
            )
示例#30
0
    def feed_headers(self):
        """Yield one feed header for each active locale."""
        if not self._locales:
            return []

        for loc in self._locales:
            # Icon and logo point at the same touch-icon asset.
            icon_url = f"https://www.{self.name}/favicons/apple-touch-icon-60x60.png"
            yield generate_feed_header(
                title=f"VICE {loc.title()}",
                path=loc,
                link=f"https://www.{self.name}",
                logo=icon_url,
                icon=icon_url,
            )
示例#31
0
 def feed_headers(self):
     """Yield feed headers for all configured channels."""
     for ch in self._channels:
         yield generate_feed_header(
             title="Ars Technica: {}".format(ch.title()),
             link="https://{}".format(self.name),
             path=ch,
             icon="https://cdn.arstechnica.net/wp-content/uploads/2016/10/"
             "cropped-ars-logo-512_480-32x32.png",
             logo="https://cdn.arstechnica.net/wp-content/themes/ars-mobile/"
             "assets/images/material-ars.png",
         )
示例#32
0
    def parse(self, response):
        """Parse a search-result page: one feed item per product thumbnail,
        a feed header on the first page, and a request for the next page.
        """
        if len(response.css(".thumbnail")) == 0:
            self.logger.info("No items found.")
            return

        for item in response.css(".thumbnail"):
            il = FeedEntryItemLoader(selector=item, base_url=self._base_url)
            il.add_css("title", ".item_brand_text ::text")
            il.add_css("title", ".item-title ::text")
            il.add_css("title", ".current-price ::text")
            il.add_value(
                "link",
                response.urljoin(item.css(".item-link::attr(href)").extract_first()),
            )
            # Image URL is embedded in an inline CSS background declaration.
            # NOTE(review): re_first can return None when data-bg is missing;
            # startswith below would then raise — confirm against live markup.
            image_url = item.css(".item-image::attr(data-bg)").re_first(
                r"url\(([^)]+)\)"
            )
            # Fix broken images.
            if image_url.startswith("https://markenankauf.momox.de/pics/https://"):
                image_url = image_url.replace(
                    "https://markenankauf.momox.de/pics/https://", "https://"
                )
            il.add_value("content_html", '<img src="{}">'.format(image_url))
            il.add_css("content_html", ".item-des-container")
            il.add_value("path", response.meta["path"])
            yield il.load_item()

        # Current page number taken from the pagination widget.
        page = int(response.css(".pagination .active a::text").extract_first())
        if page == 1:
            # The header is emitted only once, on the first result page.
            yield generate_feed_header(
                title=response.css("title ::text").re_first(
                    "(ubup | .*) Second Hand kaufen"
                ),
                subtitle="Deutschlands größter Second Hand-Onlineshop für "
                "Mode & Accessoires",
                icon="https://www.{}/images/favicon.ico".format(self.name),
                link=response.url,
                path=response.meta["path"],
            )
        if page < self._scrape_pages:
            next_page = response.css(
                ".pagination .active + li a::attr(href)"
            ).extract_first()
            if next_page:
                # dont_cache presumably bypasses a caching middleware for
                # listing pages — confirm middleware semantics.
                yield scrapy.Request(
                    response.urljoin(next_page),
                    meta={"dont_cache": True, "path": response.meta["path"]},
                )
示例#33
0
    def parse(self, response):
        """Scrape product thumbnails into feed items, emit the feed header on
        page 1, and follow pagination up to ``self._scrape_pages`` pages.
        """
        if len(response.css(".thumbnail")) == 0:
            self.logger.info("No items found.")
            return

        for item in response.css(".thumbnail"):
            il = FeedEntryItemLoader(selector=item, base_url=self._base_url)
            il.add_css("title", ".item_brand_text ::text")
            il.add_css("title", ".item-title ::text")
            il.add_css("title", ".current-price ::text")
            il.add_value(
                "link",
                response.urljoin(item.css(".item-link::attr(href)").extract_first()),
            )
            # Extract the image URL from an inline CSS url(...) declaration.
            # NOTE(review): assumes every thumbnail has a matching data-bg;
            # a None here would make startswith raise — verify.
            image_url = item.css(".item-image::attr(data-bg)").re_first(
                r"url\(([^)]+)\)"
            )
            # Fix broken images.
            if image_url.startswith("https://markenankauf.momox.de/pics/https://"):
                image_url = image_url.replace(
                    "https://markenankauf.momox.de/pics/https://", "https://"
                )
            il.add_value("content_html", f'<img src="{image_url}">')
            il.add_css("content_html", ".item-des-container")
            il.add_value("path", response.meta["path"])
            yield il.load_item()

        # Page number of the response, read from the pagination widget.
        page = int(response.css(".pagination .active a::text").extract_first())
        if page == 1:
            # Only the first page produces the feed header.
            yield generate_feed_header(
                title=response.css("title ::text").re_first(
                    "(ubup | .*) Second Hand kaufen"
                ),
                subtitle="Deutschlands größter Second Hand-Onlineshop für "
                "Mode & Accessoires",
                icon=f"https://www.{self.name}/images/favicon.ico",
                link=response.url,
                path=response.meta["path"],
            )
        if page < self._scrape_pages:
            next_page = response.css(
                ".pagination .active + li a::attr(href)"
            ).extract_first()
            if next_page:
                yield scrapy.Request(
                    response.urljoin(next_page),
                    meta={"dont_cache": True, "path": response.meta["path"]},
                )
示例#34
0
文件: ft_com.py 项目: LPP521/PyFeeds
 def feed_headers(self):
     """Yield a feed header for every Financial Times ressort."""
     # Icon and logo share the same Origami image endpoint; only the
     # requested dimensions differ.
     image_base = (
         "https://www.{}/__origami/service/image/v2/images/raw/"
         "ftlogo-v1%3Abrand-ft-logo-square-coloured?source=update-logos"
     ).format(self.name)
     for ressort in self._ressorts:
         yield generate_feed_header(
             title="Financial Times › {}".format(self._titles.get(ressort, ressort)),
             link="https://www.{}".format(self.name),
             icon=image_base + "&width=32&height=32&format=png",
             logo=image_base + "&width=194&height=194&format=png",
             path=ressort,
         )
示例#35
0
    def _parse_show(self, response):
        """Parse a show's JSON response into a feed header and episode items."""
        show = json.loads(response.text)
        feed_path = response.meta["show"]

        # Last listed image for the icon, first listed image for the logo.
        yield generate_feed_header(
            title=show["name"],
            link=show["external_urls"]["spotify"],
            icon=show["images"][-1]["url"],
            logo=show["images"][0]["url"],
            path=feed_path,
        )

        for ep in show["episodes"]["items"]:
            loader = FeedEntryItemLoader()
            loader.add_value("link", ep["external_urls"]["spotify"])
            loader.add_value("updated", ep["release_date"])
            loader.add_value("title", ep["name"])
            loader.add_value("content_html", ep["description"])
            loader.add_value("path", feed_path)
            yield loader.load_item()
示例#36
0
    def parse(self, response):
        """Request every linked article, then emit the newsletter's header."""
        path = response.meta["path"]

        for article_url in response.css('.item .title a::attr("href")').extract():
            yield scrapy.Request(
                article_url,
                self._parse_article,
                headers={"Cookie": "trackingChoice=true; choiceVersion=1"},
                meta={"path": path},
            )

        brand = response.css(".branding__image-icon::attr('alt')").extract_first()
        yield generate_feed_header(
            title="{} Newsletter".format(brand),
            subtitle=response.css(".branding__mini-teaser ::text").extract_first(),
            link=response.url,
            logo=response.css(".branding__image-icon::attr('src')").extract_first(),
            path=path,
        )
示例#37
0
 def parse(self, response):
     """Parse a downloaded RSS/Atom feed; yield a feed header and entries.

     Full-text entries are emitted directly; otherwise the linked article
     is fetched and handled by ``_parse_article``.
     """
     feed = feedparser.parse(io.BytesIO(response.body))
     if "entries" not in feed or not feed["entries"]:
         self.logger.error("Feed {} contains no entries!".format(
             response.url))
         return
     feed_entries = feed["entries"]
     feed = feed["feed"]
     yield generate_feed_header(
         title=feed.get("title"),
         subtitle=feed.get("subtitle"),
         link=feed.get("link") or response.url,
         path=response.meta["path"],
         author_name=feed.get("author_detail", {}).get("name"),
         logo=feed.get("image", {}).get("href"),
     )
     # scheme + netloc of the feed URL; used to resolve relative links.
     base_url = "://".join(urlparse(response.url)[:2])
     for entry in feed_entries:
         # Deals with protocol-relative URLs.
         link = urljoin(base_url, entry["link"])
         il = FeedEntryItemLoader(base_url=base_url)
         il.add_value("path", response.meta["path"])
         il.add_value("updated",
                      entry.get("updated") or entry.get("published"))
         il.add_value("author_name",
                      entry.get("author_detail", {}).get("name"))
         il.add_value("link", link)
         il.add_value("category",
                      [t["term"] for t in entry.get("tags", [])])
         if response.meta["fulltext"]:
             il.add_value("title", entry["title"])
             il.add_value("content_html", entry["content"][0]["value"])
             yield il.load_item()
         else:
             # Content is not part of the feed, scrape it.
             yield scrapy.Request(link,
                                  self._parse_article,
                                  meta={
                                      "feed_entry": entry,
                                      "il": il
                                  })