def parse_item(self, response):
    """Build a feed entry from a single news article page."""
    il = FeedEntryItemLoader(
        response=response,
        base_url='{}/cms/'.format(self._link),
        timezone=self._timezone,
        remove_elems=['.news-latest-date', '.news-single-rightbox', 'hr', 'h7'],
        remove_elems_xpath=[
            '//div[@class="news-single-item"]/b[1]',
            '//div[@class="news-single-item"]/br[1]',
        ],
    )
    # The <title> tag looks like "... ::: <actual title>".
    page_title = response.xpath('//head/title/text()').re_first(r'::: (.*)')
    il.add_value('title', page_title)
    il.add_value('link', response.url)
    date_text = response.xpath('//div[@class="news-single-rightbox"]').re_first(
        r'(\d{2}\.\d{2}\.\d{4})')
    il.add_value('updated', date_text)
    # Author fallbacks, most specific first: publisher meta, author meta,
    # and finally the spider's own name.
    publisher = response.xpath(
        '//head/meta[@name="publisher"]/@content').re_first('recht.at, (.*);')
    il.add_value('author_name', publisher)
    il.add_xpath('author_name', '//head/meta[@name="author"]/@content')
    il.add_value('author_name', self.name)
    il.add_xpath('author_email', '//head/meta[@name="reply-to"]/@content')
    il.add_css('content_html', '.news-single-item h7 font strong')
    il.add_css('content_html', '.news-single-item')
    yield il.load_item()
def parse_item(self, response):
    """Extract a single article page into a feed entry."""
    # The byline reads "von <name>"; fall back to the editorial team ("Red.").
    byline = " ".join(response.css(".author-date ::text").extract())
    author_match = re.search(r"von\s+(.*)", byline)
    if author_match:
        author_name = author_match.group(1)
    else:
        author_name = "Red."
    il = FeedEntryItemLoader(
        response=response,
        base_url="https://{}".format(self.name),
        remove_elems=[
            "aside",
            "script",
            "h1",
            "source",
            ".breadcrumbs",
            ".author-date",
            ".artikel-social-kommentar",
            ".bild-copyright",
            ".ressortTitleMobile",
            ".article-number",
            ".artikel-kommentarlink",
            ".umfrage-wrapper",
            ".articleIssueInfo",
            "hr",
            "center div[style='padding: 10px; background:#efefef']",
        ],
    )
    il.add_value("link", response.url)
    il.add_value("author_name", author_name)
    il.add_css("title", 'h1[itemprop="headline"]::text')
    il.add_value("updated", response.meta["updated"])
    il.add_css("content_html", "article")
    return il.load_item()
def parse(self, response):
    """Parse a paginated JSON listing and queue one profile request per item."""
    json_response = json.loads(response.text)
    links = json_response['_links']
    # NOTE(review): the original guarded on 'next' but then read 'nextPage',
    # which raises KeyError whenever only 'next' is present (and silently
    # skips pagination when only 'nextPage' is present).  Guard on the key
    # that is actually read -- confirm against the API's pagination schema.
    if 'nextPage' in links:
        yield Request(links['nextPage'], meta={'dont_cache': True})
    for item in json_response['_embedded']['items']:
        il = FeedEntryItemLoader(response=response, timezone=self._timezone,
                                 dayfirst=False)
        il.add_value('title', item['title'])
        il.add_value(
            'content_html',
            '<img src="{}">'.format(item['playlist']['preview_image_url']))
        if item['description']:
            il.add_value('content_html',
                         item['description'].replace('\r\n', '<br>'))
        il.add_value('updated', item['date'])
        # Rewrite API links to the public site.
        il.add_value(
            'link', item['url'].replace('api-tvthek.orf.at', 'tvthek.orf.at'))
        # The loader is completed once the profile document has been fetched.
        yield Request(item['_links']['profile']['href'], self._parse_profile,
                      meta={'item': il}, dont_filter=True)
def parse(self, response):
    """Turn a JSON list of articles into feed entries."""
    for article in json.loads(response.text):
        il = FeedEntryItemLoader()
        il.add_value('title', article['title'])
        il.add_value('link', article['url'])
        # A square thumbnail, if present, goes on top of the body.
        if 'thumbnail_url_1_1' in article:
            thumbnail = '<img src="{}">'.format(article['thumbnail_url_1_1'])
            il.add_value('content_html', thumbnail)
        il.add_value('content_html', article['body'])
        # publish_date is in milliseconds since the epoch.
        il.add_value('updated', delorean.epoch(article['publish_date'] / 1000))
        authors = [
            contribution['contributor']['full_name']
            for contribution in article['contributions']
        ]
        il.add_value('author_name', authors)
        il.add_value('category', article['channel']['name'])
        for topic in article['topics'] + [article['primary_topic']]:
            if topic and 'name' in topic:
                il.add_value('category', topic['name'].title())
        # Flag "not safe for work/business" posts as categories.
        if article['nsfw']:
            il.add_value('category', 'nsfw')
        if article['nsfb']:
            il.add_value('category', 'nsfb')
        il.add_value('path', response.meta['locale'])
        yield il.load_item()
def parse(self, response):
    """Emit feed entries for the 20 most recently created restaurants."""
    match = re.search("window.DELINSKI, {listViewEntities: (.*)}", response.text)
    entities = json.loads(match.group(1))["restaurants"]["entities"]
    # Newest first, ordered by creation timestamp.
    restaurants = sorted(entities.values(), key=lambda r: int(r["created"]),
                         reverse=True)
    for restaurant in restaurants[:20]:
        il = FeedEntryItemLoader(timezone="UTC", base_url=response.url)
        url = response.urljoin(restaurant["url"])
        il.add_value("link", url)
        il.add_value("title", restaurant["name"])
        content = """
        <img src="{image}">
        <ul>
            <li>{address}</li>
            <li>{price_range_human}</li>
            <li>{cuisine_text}</li>
        </ul>
        """
        il.add_value("content_html", content.format(**restaurant))
        il.add_value("updated",
                     datetime.utcfromtimestamp(int(restaurant["created"])))
        # The detail page contributes the rest of the entry.
        yield scrapy.Request(url, self._parse_restaurant, meta={"il": il})
def parse_movies(self, response):
    """Create one feed entry per movie hit in the JSON response."""
    for entry in json.loads(response.text)["hits"]:
        il = FeedEntryItemLoader(response=response,
                                 base_url="https://{}".format(self.name))
        il.add_value("path", "{}".format(response.meta["movies"]))
        il.add_value(
            "link", "https://www.{}/kino/{}".format(self.name, entry["prod_id"]))
        il.add_value("title", entry["prod"])
        il.add_value("content_html", entry["comment"])
        for image in entry["images"] or []:
            img_tag = (
                '<img src="https://faltercdn2.falter.at/events/1080/{}">'.format(
                    image["filename"]))
            il.add_value("content_html", img_tag)
        if "stream" in entry:
            il.add_value("content_html",
                         '<a href="{s}">{s}</a>'.format(s=entry["stream"]))
        # Truthy "has_*" / "is_*" flags become categories without their prefix.
        for key, value in entry.items():
            if value and key.startswith("has_"):
                il.add_value("category", key.replace("has_", ""))
            elif value and key.startswith("is_"):
                il.add_value("category", key.replace("is_", ""))
        il.add_value("updated", entry["index_date"])
        yield il.load_item()
def _parse_article(self, response):
    """Extract an article, bailing out when bot detection served a stub page."""
    title = response.css('meta[property="og:title"]::attr(content)').extract_first()
    if not title:
        # Bot-detection pages carry no og:title; mark as transient to retry.
        raise DropResponse(
            "Skipping {} because ran into bot detection".format(response.url),
            transient=True,
        )
    il = FeedEntryItemLoader(
        response=response,
        base_url="https://{}".format(self.name),
        remove_elems=[
            "meta",
            ".ds-share-list",
            ".advert",
            ".layout-article-links",
            ".ds-chapter-list",
            ".layout-article-meta",
        ],
        change_tags={
            ".article__lead-image": "figure",
            ".article__description": "h2",
            ".article__footnote": "i",
        },
    )
    il.add_value("link", response.url)
    il.add_value("title", title)
    il.add_css("updated", "time.article__dateline-datetime::attr('datetime')")
    il.add_css("content_html", ".article__lead-image")
    il.add_css("content_html", ".article__description")
    il.add_css("content_html", ".layout-article-body")
    il.add_value("path", response.meta["ressort"])
    return il.load_item()
def _parse_video_page(self, response):
    """Build a partial entry for a video page and request its stream manifest."""
    id_match = re.search(
        r"https?://(?:www\.)?servustv\.com/videos/(?P<id>[aA]{2}-\w+|\d+-\d+)",
        response.url,
    )
    if not id_match:
        return
    video_id = id_match.group("id").upper()
    il = FeedEntryItemLoader(response=response)
    il.add_value("link", response.url)
    section = response.css(
        "meta[property='article:section']::attr('content')").extract_first()
    # "Allgemein" is a catch-all section and adds nothing to the title.
    if section != "Allgemein":
        il.add_value("title", section)
    il.add_css("title", "title::text", re="(.*) - Servus TV")
    image_url = response.css(
        "meta[property='og:image']::attr('content')").extract_first()
    il.add_value("content_html", '<img src="{}">'.format(image_url))
    il.add_css("content_html", "meta[property='og:description']::attr('content')")
    il.add_css("content_html", "#media-asset-content-container")
    modified_match = re.search(r'"dateModified":\s*"([^"]+)"', response.text)
    if modified_match:
        il.add_value("updated", modified_match.group(1))
    stream_url = "https://stv.rbmbtnx.net/api/v1/manifests/%s.m3u8" % video_id
    # The stream callback finishes the item.
    yield Request(stream_url, self._parse_stream, meta={"il": il})
def _parse_article(self, response):
    """Extract title and content for a feed entry.

    Uses readability's ``Document`` on HTML responses; for non-text
    responses (or when extraction looks broken) it falls back to the
    title/summary carried in the original feed entry.
    """
    feed_entry = response.meta["feed_entry"]
    il = FeedEntryItemLoader(parent=response.meta["il"])
    try:
        # Accessing .text raises AttributeError on binary responses.
        response.text
    except AttributeError:
        # Response is not text (e.g. PDF, ...).
        il.add_value("title", feed_entry.get("title"))
        il.add_value("content_html", feed_entry.get("summary"))
        return il.load_item()
    doc = Document(response.text, url=response.url)
    il.add_value("title", doc.short_title() or feed_entry.get("title"))
    summary = feed_entry.get("summary")
    try:
        content = doc.summary(html_partial=True)
        if summary and len(summary) > len(content):
            # Something probably went wrong if the extracted content is shorter than
            # the summary.
            raise Unparseable
    except Unparseable:
        # Fall back to the feed-provided summary.
        content = summary
    il.add_value("content_html", content)
    return il.load_item()
def _parse_episode(self, response):
    """Build a feed entry for a single episode page."""
    il = FeedEntryItemLoader(
        response=response,
        base_url=f"https://{self.name}",
        timezone="Europe/Vienna",
        dayfirst=True,
    )
    il.add_value("link", response.url)
    # The title meta looks like "<show> vom <date> - puls4.com"; keep the show.
    il.add_xpath(
        "title",
        '//meta[@name="title"]/@content',
        re=r"(?s)(.*?)(?: vom .*)? - puls4\.com",
    )
    episode_date = response.xpath('//meta[@name="title"]/@content').re_first(
        r".*vom (\d{2}\.\d{2}\.\d{4}).*")
    episode_time = response.meta["time"] or "00:00"
    il.add_value("updated", "{} {}".format(episode_date, episode_time))
    og_image = response.xpath(
        '//meta[@property="og:image"]/@content').extract_first()
    il.add_value("content_html", '<img src="{}">'.format(og_image))
    il.add_css("content_html", ".player-video-description-intro::text")
    return il.load_item()
def parse_item(self, response):
    """Parse an article page into a feed entry."""
    il = FeedEntryItemLoader(
        response=response,
        timezone=self._timezone,
        base_url='http://{}'.format(self.name),
        remove_elems=[
            'aside', 'script', 'h1', '.breadcrumbs', '.author-date',
            '.artikel-social-kommentar', '.bild-copyright',
            '.ressortTitleMobile', '.article-number',
            '.artikel-kommentarlink', '.umfrage-wrapper', '.articleIssueInfo',
        ])
    il.add_value('link', response.url)
    # Byline like "Von First Last"; default to the editorial team ("Red.").
    byline = response.css('.author-date ::text').re(r'(?:Von)?\s*(\w+ \w+)')
    il.add_value('author_name', byline or 'Red.')
    il.add_css('title', 'h1[itemprop="headline"]::text')
    # Keep only the part before the timezone offset ("+...").
    il.add_css('updated',
               'meta[property="article:published_time"]::attr(content)',
               re='([^+]*)')
    il.add_css('content_html', 'article')
    yield il.load_item()
def parse_broadcast(self, response):
    """Convert a broadcast JSON document into a feed entry with enclosure."""
    broadcast = json.loads(response.text)
    il = FeedEntryItemLoader(response=response,
                             timezone=self._timezone,
                             dayfirst=False)
    link = 'https://{}/programm/{}/{}'.format(
        self.name, response.meta['oe1_day'], broadcast['programKey'])
    il.add_value('link', link)
    # Program title first, individual broadcast title second.
    il.add_value('title', broadcast['programTitle'])
    il.add_value('title', broadcast['title'])
    streams = broadcast.get('streams')
    if streams:
        stream = 'http://loopstream01.apa.at/?channel=oe1&id={}'.format(
            streams[0]['loopStreamId'])
        il.add_value('enclosure_iri', stream)
        il.add_value('enclosure_type', 'audio/mpeg')
    il.add_value('updated', broadcast['niceTimeISO'])
    if broadcast['subtitle']:
        il.add_value('content_html',
                     '<strong>{}</strong>'.format(broadcast['subtitle']))
    for item in broadcast['items']:
        if 'title' in item:
            il.add_value('content_html', '<h3>{}</h3>'.format(item['title']))
        il.add_value('content_html', item.get('description'))
    il.add_value('content_html', broadcast['description'])
    yield il.load_item()
def parse_node(self, response, node):
    """Turn one RSS node into a feed entry or a follow-up request.

    Paywalled items are only summarized for non-subscribers; weekly
    editions and regular articles are fetched in full via their own
    callbacks.
    """
    il = FeedEntryItemLoader(response=response, base_url=f"https://{self.name}")
    updated = dateutil_parse(node.xpath("dc:date/text()").extract_first())
    il.add_value("updated", updated)
    title = node.xpath("rss:title/text()").extract_first()
    # Subscriber-only articles are marked with a "[$]" title prefix.
    paywalled = title.startswith("[$]")
    if paywalled:
        title = title.replace("[$] ", "")
        il.add_value("category", "paywalled")
    link = node.xpath("rss:link/text()").extract_first()
    # NOTE(review): replace() removes every "rss" occurrence in the URL, not
    # just a trailing path segment -- presumably feed links always end in
    # "rss"; confirm it cannot clobber an article slug containing "rss".
    link = link.replace("rss", "")
    link = link.replace("http://", "https://")
    meta = {"il": il}
    if paywalled and not self._subscribed:
        # Without a subscription, only the feed's own summary is available.
        il.add_value("title", title)
        il.add_value("author_name", node.xpath("dc:creator/text()").extract_first())
        il.add_value("content_text", node.xpath("rss:description/text()").extract_first())
        il.add_value("link", link)
        return il.load_item()
    else:
        if "LWN.net Weekly Edition for" in title:
            meta["updated"] = updated
            callback = self._parse_weekly_edition
            # "bigpage" serves the whole weekly edition on one page.
            link += "bigpage"
        else:
            callback = self._parse_article
        # Don't include link yet, we will use the subscriber link later.
        # So subscriber articles can be shared from the feed reader and
        # read in browser without logging in.
        return scrapy.Request(link, callback, meta=meta)
def _parse_article(self, response):
    """Parse a single article page into a feed entry.

    Emits nothing for deleted articles (HTTP 410).
    """
    if response.status == 410:
        # Article has been deleted.
        return
    remove_elems = [
        '.bildtext .author',
        'iframe',
    ]
    change_tags = {
        'h1': 'h2'
    }
    il = FeedEntryItemLoader(response=response,
                             timezone=self._timezone,
                             base_url='https://www.{}'.format(self.name),
                             remove_elems=remove_elems,
                             change_tags=change_tags,
                             dayfirst=False,
                             yearfirst=False)
    if response.css('.payment'):
        il.add_value('category', 'paywalled')
    il.add_css('link', 'link[rel="canonical"]::attr(href)')
    il.add_css('title', 'meta[property="og:title"]::attr(content)')
    # Raw string: '\s' in a plain literal is an invalid escape sequence
    # (SyntaxWarning on modern Python); the pattern itself is unchanged.
    il.add_css('author_name', '.druckheadline::text', re=r'·\s*(.*)\s*·')
    il.add_css('updated', 'meta[http-equiv="last-modified"]::attr(content)')
    il.add_css('content_html', '.druckcontent')
    il.add_value('path', response.meta['ressort'])
    yield il.load_item()
def parse_album(self, response):
    """Build a feed entry for an album product page."""

    def _replace_track_info(elem):
        # Render "<track> (<format info>)" as a single paragraph.
        children = [child.text_content().strip() for child in elem.getchildren()]
        return '<p>{} <i>({})</i></p>'.format(children[0], children[1])

    title = response.xpath(
        '//h1[@class="c-product-block__title"]//text()').extract()[-1].strip()
    # The first comma-separated contributor is treated as the artist.
    artist = response.xpath(
        '//div[contains(@class,"c-product-block__contributors")]/p/text()'
    ).re_first('[^,]+')
    il = FeedEntryItemLoader(
        response=response,
        base_url="https://{}/".format(self.name),
        remove_elems=[
            '.c-product-block__title', '.c-product__product-purchase',
            '.c-track__format-specific-info', '.c-track__duration',
            '.c-track__details', '.c-tracklist__initial-tracks',
            '.c-tabs-block__tabs-links', 'button'
        ],
        replace_elems={'.c-track__all-format-info': _replace_track_info})
    il.add_value("title", '{} - {}'.format(artist, title))
    il.add_value("link", response.url)
    il.add_value("author_name", 'bot')
    il.add_css("content_html", 'div.c-page--product')
    return il.load_item()
def _parse_article_url(self, response):
    """Create a feed entry for an article, following its print version if any.

    Skips pages that present an error heading instead of content.
    """
    # extract_first() may be None (no <h2> at all); treat that as "no error"
    # instead of letting `in None` raise TypeError.
    heading = response.css('h2 ::text').extract_first() or ''
    if 'Fehler' in heading:
        self.logger.info('Skipping {} as it returned an error'.format(
            response.url))
        return
    remove_elems = ['div[style="padding-top:10px;"]']
    il = FeedEntryItemLoader(response=response,
                             timezone=self._timezone,
                             base_url='http://{}'.format(self.name),
                             dayfirst=True,
                             remove_elems=remove_elems)
    il.add_value('link', response.url)
    il.add_value('author_name', 'VKI')
    # Raw string avoids the invalid '\s' escape in a plain literal
    # (SyntaxWarning on modern Python); the pattern is unchanged.
    date = response.css('.issue').re_first(
        r'veröffentlicht:\s*([0-9]{2}\.[0-9]{2}\.[0-9]{4})')
    il.add_value('updated', date)
    url = (response.xpath('//a[text()="Druckversion"]/@onclick').re_first(
        r"window\.open\('(.*)'\);"))
    il.add_css('title', 'h1::text')
    if url:
        yield scrapy.Request(response.urljoin(url),
                             callback=self._parse_article,
                             meta={'il': il})
    else:
        # No print version available -> paywalled; use the free teaser only.
        il.add_value('category', 'paywalled')
        il.add_css('content_html', '.primary')
        il.add_css('content_html', 'div[style="padding-top:10px;"] > h3')
        yield il.load_item()
def _parse_article(self, response):
    """Append one page of an article; follow pagination until the last page."""
    il = FeedEntryItemLoader(
        response=response,
        parent=response.meta["il"],
        remove_elems=[
            ".caption-credit",
            ".gallery-image-credit",
            "#social-left",
            "ul.toc",
            "h3:contains('Table of Contents')",
            "br",
            ".sidebar:contains('Further Reading')",
            ".credit",
        ],
        replace_elems={"div.image": self._div_to_img},
        change_tags={".sidebar": "blockquote", "aside": "blockquote"},
    )
    if response.meta.get("first_page", False):
        # Entry metadata is only taken from the first page.
        il.add_value("link", response.url)
        il.add_css("author_name", ".byline a span ::text")
        il.add_css("content_html", "header h2")
        il.add_value("path", response.meta["path"])
    il.add_css("content_html", ".article-content")
    if not response.css(".next"):
        return il.load_item()
    # The last pagination number links to the next page.
    next_url = response.css(".numbers a::attr(href)").extract()[-1]
    return scrapy.Request(
        next_url,
        self._parse_article,
        meta={"il": il, "path": response.meta["path"]},
    )
def _parse_article(self, response):
    """Parse a story page into a feed entry."""

    def _use_original_img(elem):
        # Lazy-loaded images keep the real URL in data-original.
        if "data-original" in elem.attrib:
            elem.attrib["src"] = elem.attrib["data-original"]
        return elem

    il = FeedEntryItemLoader(
        response=response,
        base_url=self._base_url,
        remove_elems=[
            ".credit",
            ".hide-caption",
            ".toggle-caption",
            ".enlarge-options",
            ".enlarge_measure",
            ".enlarge_html",
            ".ad-backstage",
            'p:first-of-type:contains("Editor\'s Note: This is an excerpt of")',
            'p:contains("Did you enjoy this newsletter segment?")',
        ],
        replace_elems={"img": _use_original_img},
        change_tags={".image": "figure", ".credit-caption": "figcaption"},
    )
    il.add_css("title", "h1 ::text")
    il.add_value("link", response.url)
    il.add_css("content_html", "#storytext")
    il.add_value("path", response.meta["path"])
    il.add_css("updated", '.dateblock time::attr("datetime")')
    il.add_css("author_name", ".byline__name a::text")
    yield il.load_item()
def parse_item(self, response):
    """Parse an article page into a feed entry."""
    il = FeedEntryItemLoader(
        response=response,
        base_url="https://{}".format(self.name),
        remove_elems=[
            "aside",
            "script",
            "h1",
            "source",
            ".breadcrumbs",
            ".author-date",
            ".artikel-social-kommentar",
            ".bild-copyright",
            ".ressortTitleMobile",
            ".article-number",
            ".artikel-kommentarlink",
            ".umfrage-wrapper",
            ".articleIssueInfo",
        ],
    )
    il.add_value("link", response.url)
    # Byline like "Von First Last"; default to the editorial team ("Red.").
    byline = response.css(".author-date ::text").re(r"(?:Von)?\s*(\w+ \w+)")
    il.add_value("author_name", byline or "Red.")
    il.add_css("title", 'h1[itemprop="headline"]::text')
    il.add_value("updated", response.meta["updated"])
    il.add_css("content_html", "article")
    return il.load_item()
def parse_release_changelog(self, response):
    """Append the detailed changelog section to the inherited feed entry."""
    loader = FeedEntryItemLoader(
        response=response,
        parent=response.meta["il"],
        base_url=self._base_url,
    )
    loader.add_value("content_html", "<h1>Detailed Changelog</h1>")
    # Everything after the first heading is the changelog body.
    loader.add_xpath("content_html", "//h1/following-sibling::*")
    return loader.load_item()
def parse_archive_search(self, response):
    """Create one feed entry per archive search hit and fetch its full text."""
    articles = json.loads(response.text)["articles"]["hits"]
    for i, item in enumerate(articles):
        il = FeedEntryItemLoader(
            response=response,
            base_url="https://{}".format(self.name),
            timezone="Europe/Vienna",
        )
        il.add_value("path", "magazine")
        link = response.urljoin(item["detail_link"])
        il.add_value("link", link)
        try:
            # Strip a leading "Von" or "...:" prefix from the author list.
            author = re.sub(
                r"(?:.*:|Von)\s*(.*)", r"\1", ", ".join(item["authors"]).title()
            )
            il.add_value("author_name", author)
        except IndexError:
            # NOTE(review): nothing in the try block obviously raises
            # IndexError (a missing "authors" key would be KeyError) --
            # verify which exception this was meant to swallow.
            pass
        il.add_value("title", item["title"])
        # All articles have the same date.
        # We add an offset so they are sorted in the right order.
        date = response.meta["issue_date"] + timedelta(seconds=i)
        il.add_value("updated", date)
        il.add_value("category", item["ressort"])
        yield scrapy.Request(link, self.parse_item_text, meta={"il": il})
def _parse_article(self, response):
    """Add the article body to the feed entry inherited from the listing."""
    il = FeedEntryItemLoader(response=response,
                             parent=response.meta['il'],
                             base_url='http://{}'.format(self.name),
                             remove_elems=['#issue', 'h1', '#slogan',
                                           '#logo', '#footer'])
    il.add_css('content_html', '#page')
    yield il.load_item()
def _parse_article(self, response):
    """Fill in the entry content from the article page."""
    il = FeedEntryItemLoader(response=response,
                             parent=response.meta['il'],
                             remove_elems=['iframe', 'script'],
                             base_url='http://{}'.format(self.name))
    il.add_css('content_html', '.entry-content')
    return il.load_item()
def parse_release_changelog(self, response):
    """Append the detailed changelog to the release's feed entry."""
    loader = FeedEntryItemLoader(
        response=response,
        parent=response.meta['il'],
        base_url=self._base_url,
    )
    loader.add_value('content_html', '<h1>Detailed Changelog</h1>')
    # Everything after the first heading is the changelog body.
    loader.add_xpath('content_html', '//h1/following-sibling::*')
    yield loader.load_item()
def parse(self, response):
    """Convert a page's posts (JSON) into feed items.

    Emits a feed header first, then one entry per post.  The entry title
    is derived from the message's first line, the link name, or the link
    itself, in that order of preference.
    """
    page = json.loads(response.text)
    yield generate_feed_header(title=page["name"], link=page["link"],
                               path=response.meta["page_id"])
    for entry in page["posts"]["data"]:
        il = FeedEntryItemLoader()
        # updated_time also includes new comments not only updates to the
        # post.
        il.add_value("updated", entry["created_time"])
        # Post ids have the form "<user_id>_<post_id>".
        il.add_value(
            "link",
            "https://www.{name}/{user_id}/posts/{post_id}".format(
                name=self.name,
                **dict(zip(["user_id", "post_id"], entry["id"].split("_")))),
        )
        message = entry.get("message")
        name = entry.get("name")
        link = entry.get("link")
        if message:
            message = message.splitlines()
            title = message[0]
            if len(title.split()) < 10 and not title.startswith("http"):
                # If the first line has less than ten words, it could be a
                # title.
                if title.upper() == title:
                    # All-caps titles are converted to title case.
                    title = title.title()
                del message[0]
            elif name and not name.startswith("http"):
                # Fallback to the name (of the link).
                title = name
            else:
                # Fallback to the first ten words of the message.
                title = " ".join(message[0].split(maxsplit=10)) + " ..."
            # Join remaining lines as paragraphs and auto-link URLs.
            message = bleach.linkify("</p><p>".join(message))
            il.add_value("content_html", "<p>{}</p>".format(message))
        elif name:
            title = name
        else:
            title = link
        il.add_value("title", title)
        if link and name:
            il.add_value(
                "content_html",
                '<p><a href="{link}">{name}</a></p>'.format(link=link, name=name),
            )
        picture = entry.get("picture")
        if picture:
            il.add_value(
                "content_html",
                '<a href="{link}"><img src="{image}"></a>'.format(
                    link=link, image=picture),
            )
        il.add_value("path", response.meta["page_id"])
        yield il.load_item()
def parse_item(self, response):
    """Extract the page's main content into the inherited feed entry."""
    il = FeedEntryItemLoader(response=response,
                             parent=response.meta['il'],
                             remove_elems=['h1', '.delayed-image-load'],
                             change_tags={'noscript': 'div'},
                             base_url='http://{}'.format(self.name))
    il.add_xpath('content_html', '//div[@id="main-inner"]')
    yield il.load_item()
def _parse_article(self, response):
    """Fill in the article body for the inherited feed entry."""
    loader = FeedEntryItemLoader(
        response=response,
        parent=response.meta["il"],
        base_url=f"https://{self.name}",
        remove_elems=["#issue", "h1", "#slogan", "#logo", "#footer"],
    )
    loader.add_css("content_html", "#page")
    return loader.load_item()
def parse_letter(self, response):
    """Parse a single newsletter issue page into a feed entry."""
    account = response.meta["account"]
    il = FeedEntryItemLoader(response=response,
                             base_url=self._links.get(account))
    il.add_value("path", account)
    il.add_value("link", response.url)
    il.add_css("title", "title::text")
    il.add_css("author_name", "div#message-heading div.by-line a::text")
    il.add_css("updated", "div#message-heading div.date::text")
    il.add_css("content_html", "div.message-body")
    yield il.load_item()
def parse_item(self, response):
    """Build a feed entry from the page's main <div>."""
    il = FeedEntryItemLoader(
        selector=response.xpath('//div[@class="main"]'),
        timezone="Europe/Vienna")
    il.add_xpath("title", "h1/text()")
    il.add_value("link", response.url)
    il.add_xpath("content_html", "h1/following-sibling::*")
    # The last URL segment starts with the date, e.g. ".../<date>_<slug>".
    last_segment = response.url.rstrip("/").split("/")[-1]
    il.add_value("updated", last_segment.split("_")[0])
    il.add_value("author_name", self.name)
    return il.load_item()
def _parse_restaurant(self, response):
    """Augment the inherited entry with the restaurant's detail page."""
    il = FeedEntryItemLoader(
        response=response,
        base_url=response.url,
        parent=response.meta["il"],
        remove_elems=[".external"],
    )
    il.add_css("content_html", ".content .right p")
    il.add_css("content_html", ".restaurant-link")
    il.add_css("category", ".tags a ::text")
    yield il.load_item()