def parse_item(self, response):
    remove_elems = [
        'aside',
        'script',
        'h1',
        '.breadcrumbs',
        '.author-date',
        '.artikel-social-kommentar',
        '.bild-copyright',
        '.ressortTitleMobile',
        '.article-number',
        '.artikel-kommentarlink',
        '.umfrage-wrapper',
        '.articleIssueInfo',
    ]
    il = FeedEntryItemLoader(response=response,
                             timezone=self._timezone,
                             base_url='http://{}'.format(self.name),
                             remove_elems=remove_elems)
    il.add_value('link', response.url)
    author_name = (
        response.css('.author-date ::text').re(r'(?:Von)?\s*(\w+ \w+)')
        or 'Red.')
    il.add_value('author_name', author_name)
    il.add_css('title', 'h1[itemprop="headline"]::text')
    il.add_css('updated',
               'meta[property="article:published_time"]::attr(content)',
               re='([^+]*)')
    il.add_css('content_html', 'article')
    yield il.load_item()

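# NOTE: These snippets assume a handful of shared imports that the excerpts
# themselves do not show. A minimal sketch of the common header; the module
# path for FeedEntryItemLoader follows the feeds/PyFeeds project layout and
# is an assumption, not something shown in these excerpts.
import html
import json
import re
from datetime import datetime, timedelta, timezone
from urllib.parse import urlsplit

import scrapy
from scrapy import Request
from scrapy.http import HtmlResponse

from feeds.loaders import FeedEntryItemLoader  # assumed import path
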
def parse_album(self, response):
    def _replace_track_info(elem):
        parts = list(
            map(lambda x: x.text_content().strip(), elem.getchildren()))
        return '<p>{} <i>({})</i></p>'.format(parts[0], parts[1])

    title = response.xpath(
        '//h1[@class="c-product-block__title"]//text()'
    ).extract()[-1].strip()
    artist = response.xpath(
        '//div[contains(@class,"c-product-block__contributors")]/p/text()'
    ).re_first('[^,]+')
    il = FeedEntryItemLoader(
        response=response,
        base_url="https://{}/".format(self.name),
        remove_elems=[
            '.c-product-block__title',
            '.c-product__product-purchase',
            '.c-track__format-specific-info',
            '.c-track__duration',
            '.c-track__details',
            '.c-tracklist__initial-tracks',
            '.c-tabs-block__tabs-links',
            'button',
        ],
        replace_elems={'.c-track__all-format-info': _replace_track_info})
    il.add_value("title", '{} - {}'.format(artist, title))
    il.add_value("link", response.url)
    il.add_value("author_name", 'bot')
    il.add_css("content_html", 'div.c-page--product')
    return il.load_item()

def _parse_article(self, response):
    remove_elems = [
        ".caption-credit",
        ".gallery-image-credit",
        "#social-left",
        "ul.toc",
        "h3:contains('Table of Contents')",
        "br",
        ".sidebar:contains('Further Reading')",
        ".credit",
    ]
    change_tags = {".sidebar": "blockquote", "aside": "blockquote"}
    replace_elems = {"div.image": self._div_to_img}
    il = FeedEntryItemLoader(
        response=response,
        parent=response.meta["il"],
        remove_elems=remove_elems,
        replace_elems=replace_elems,
        change_tags=change_tags,
    )
    if response.meta.get("first_page", False):
        il.add_value("link", response.url)
        il.add_css("author_name", ".byline a span ::text")
        il.add_css("content_html", "header h2")
        il.add_value("path", response.meta["path"])
    il.add_css("content_html", ".article-content")
    if response.css(".next"):
        return scrapy.Request(
            response.css(".numbers a::attr(href)").extract()[-1],
            self._parse_article,
            meta={"il": il, "path": response.meta["path"]},
        )
    else:
        return il.load_item()

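# `self._div_to_img` above is referenced but not defined in these excerpts.
# A minimal sketch of what such a `replace_elems` callback could look like,
# assuming the image URL sits in a lazy-loading attribute on the <div>; the
# attribute names are assumptions, not the project's actual helper.
import lxml.html


def _div_to_img(self, elem):
    # Hypothetical: read the image URL from a lazy-loading attribute and
    # return a plain <img> element in its place.
    src = elem.attrib.get("data-src") or elem.attrib.get("data-original")
    if not src:
        # Returning None drops the element (see _clean_caption further down).
        return None
    return lxml.html.fromstring('<img src="{}">'.format(src))
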
def parse(self, response):
    json_response = json.loads(response.text)
    # Use the same key for the check and the access (the original checked
    # 'next' but then read 'nextPage', which would raise a KeyError).
    if 'nextPage' in json_response['_links']:
        yield Request(json_response['_links']['nextPage'],
                      meta={'dont_cache': True})
    for item in json_response['_embedded']['items']:
        il = FeedEntryItemLoader(response=response,
                                 timezone=self._timezone,
                                 dayfirst=False)
        il.add_value('title', item['title'])
        il.add_value(
            'content_html',
            '<img src="{}">'.format(item['playlist']['preview_image_url']))
        if item['description']:
            il.add_value('content_html',
                         item['description'].replace('\r\n', '<br>'))
        il.add_value('updated', item['date'])
        il.add_value(
            'link', item['url'].replace('api-tvthek.orf.at', 'tvthek.orf.at'))
        yield Request(item['_links']['profile']['href'],
                      self._parse_profile,
                      meta={'item': il},
                      dont_filter=True)

def parse_release_changelog(self, response):
    il = FeedEntryItemLoader(response=response,
                             parent=response.meta["il"],
                             base_url=self._base_url)
    il.add_value("content_html", "<h1>Detailed Changelog</h1>")
    il.add_xpath("content_html", "//h1/following-sibling::*")
    return il.load_item()

def _parse_article(self, response):
    def _fix_img_src(elem):
        if "data-original" in elem.attrib:
            elem.attrib["src"] = elem.attrib["data-original"]
        return elem

    remove_elems = [
        ".credit",
        ".hide-caption",
        ".toggle-caption",
        ".enlarge-options",
        ".enlarge_measure",
        ".enlarge_html",
        ".ad-backstage",
        'p:first-of-type:contains("Editor\'s Note: This is an excerpt of")',
        'p:contains("Did you enjoy this newsletter segment?")',
    ]
    replace_elems = {"img": _fix_img_src}
    change_tags = {".image": "figure", ".credit-caption": "figcaption"}
    il = FeedEntryItemLoader(
        response=response,
        base_url=self._base_url,
        remove_elems=remove_elems,
        replace_elems=replace_elems,
        change_tags=change_tags,
    )
    il.add_css("title", "h1 ::text")
    il.add_value("link", response.url)
    il.add_css("content_html", "#storytext")
    il.add_value("path", response.meta["path"])
    il.add_css("updated", '.dateblock time::attr("datetime")')
    il.add_css("author_name", ".byline__name a::text")
    yield il.load_item()

def parse_item(self, response):
    il = FeedEntryItemLoader(
        response=response,
        base_url='{}/cms/'.format(self._link),
        timezone=self._timezone,
        remove_elems=['.news-latest-date', '.news-single-rightbox', 'hr',
                      'h7'],
        remove_elems_xpath=['//div[@class="news-single-item"]/b[1]',
                            '//div[@class="news-single-item"]/br[1]'],
    )
    il.add_value(
        'title',
        response.xpath('//head/title/text()').re_first(r'::: (.*)'))
    il.add_value('link', response.url)
    il.add_value(
        'updated',
        response.xpath('//div[@class="news-single-rightbox"]').re_first(
            r'(\d{2}\.\d{2}\.\d{4})'))
    il.add_value(
        'author_name',
        response.xpath('//head/meta[@name="publisher"]/@content').re_first(
            'recht.at, (.*);'))
    il.add_xpath('author_name', '//head/meta[@name="author"]/@content')
    il.add_value('author_name', self.name)
    il.add_xpath('author_email', '//head/meta[@name="reply-to"]/@content')
    il.add_css('content_html', '.news-single-item h7 font strong')
    il.add_css('content_html', '.news-single-item')
    yield il.load_item()

def _parse_article(self, response):
    if response.status == 410:
        # Article has been deleted.
        return

    remove_elems = [".bildtext .author", "iframe"]
    change_tags = {"h1": "h2", ".bildbox": "figure", ".bildtext": "figcaption"}
    il = FeedEntryItemLoader(
        response=response,
        timezone="Europe/Vienna",
        base_url="https://www.{}".format(self.name),
        remove_elems=remove_elems,
        change_tags=change_tags,
        dayfirst=True,
        yearfirst=False,
    )
    if response.css(".payment"):
        il.add_value("category", "paywalled")
    il.add_css("link", 'link[rel="canonical"]::attr(href)')
    il.add_css("title", 'meta[property="og:title"]::attr(content)')
    il.add_css("author_name", ".druckheadline::text", re=r"·\s*(.*)\s*·")
    # Mon, 01 Oct 18 13:42:45 +0200
    il.add_css("updated", 'meta[http-equiv="last-modified"]::attr(content)')
    il.add_css("content_html", ".druckcontent")
    il.add_value("path", response.meta["ressort"])
    return il.load_item()

def _parse_article(self, response):
    if response.status == 410:
        # Article has been deleted.
        return

    remove_elems = [
        '.bildtext .author',
        'iframe',
    ]
    change_tags = {
        'h1': 'h2',
    }
    il = FeedEntryItemLoader(response=response,
                             timezone=self._timezone,
                             base_url='https://www.{}'.format(self.name),
                             remove_elems=remove_elems,
                             change_tags=change_tags,
                             dayfirst=False,
                             yearfirst=False)
    if response.css('.payment'):
        il.add_value('category', 'paywalled')
    il.add_css('link', 'link[rel="canonical"]::attr(href)')
    il.add_css('title', 'meta[property="og:title"]::attr(content)')
    # Raw string so \s is not treated as an invalid escape sequence.
    il.add_css('author_name', '.druckheadline::text', re=r'·\s*(.*)\s*·')
    il.add_css('updated', 'meta[http-equiv="last-modified"]::attr(content)')
    il.add_css('content_html', '.druckcontent')
    il.add_value('path', response.meta['ressort'])
    yield il.load_item()

def parse_letter(self, response):
    account = response.meta["account"]
    il = FeedEntryItemLoader(response=response,
                             base_url=self._links.get(account))
    il.add_value("path", account)
    il.add_value("link", response.url)
    il.add_css("title", "title::text")
    il.add_css("author_name", "div#message-heading div.by-line a::text")
    il.add_css("updated", "div#message-heading div.date::text")
    il.add_css("content_html", "div.message-body")
    yield il.load_item()

def _parse_article(self, response):
    def _fix_img_src(elem):
        if "src" not in elem.attrib:
            if "data-lazy-src" in elem.attrib:
                elem.attrib["src"] = elem.attrib["data-lazy-src"]
            elif "data-src" in elem.attrib:
                elem.attrib["src"] = elem.attrib["data-src"]
        return elem

    def _parse_breadcrumbs(breadcrumbs):
        links = breadcrumbs.css("a::text, a::attr('href')").extract()
        return {k[1:]: v for k, v in zip(links[::2], links[1::2])}

    breadcrumbs = _parse_breadcrumbs(
        response.css(".site-contextnavigation-breadcrumbs-nav a")
    )
    self._titles = {**self._titles, **breadcrumbs}

    remove_elems = [
        "ad-container",
        "figure > footer",
        "picture > button",
        "div[data-section-type='newsletter']",
        ".gallery-summary",
    ]
    change_tags = {
        ".article-subtitle": "strong",
        "aside": "blockquote",
        "p strong:only-child": "h3",
    }
    replace_elems = {"img": _fix_img_src}
    il = FeedEntryItemLoader(
        response=response,
        base_url="https://{}".format(self.name),
        remove_elems=remove_elems,
        change_tags=change_tags,
        replace_elems=replace_elems,
        timezone="Europe/Vienna",
    )
    il.add_value("link", response.url)
    il.add_css("title", 'meta[property="og:title"]::attr(content)')
    if response.css(".article-origins .article-author-avatar"):
        # Blog posts.
        il.add_css("author_name", ".article-author-avatar > span ::text")
    else:
        # Normal articles.
        il.add_css("author_name", ".article-origins ::text")
    il.add_value("path", response.meta["ressort"])
    il.add_value("category", breadcrumbs.values())
    il.add_css("category", ".storylabels span ::text")
    il.add_css("updated", "time::attr('datetime')")
    il.add_css("content_html", ".article-subtitle")
    il.add_css("content_html", ".article-body")
    return il.load_item()

def parse_node(self, response, node):
    il = FeedEntryItemLoader(selector=node)
    url = node.xpath("link/text()").extract_first()
    il.add_value("link", url)
    il.add_xpath("updated", "pubDate/text()")
    il.add_xpath(
        "title",
        "title/text()",
        # Use re.DOTALL since some titles have newlines in them.
        re=re.compile("(?:Artikel|Tagebuch): (.*)", re.DOTALL),
    )
    return scrapy.Request(url, self._parse_article, meta={"il": il})

def parse_node(self, response, node):
    url = node.xpath("rss:loc/text()").extract_first()
    il = FeedEntryItemLoader(selector=node)
    il.add_value("link", url)
    il.add_xpath("title", "news:news/news:title/text()")
    keywords = node.xpath("news:news/news:keywords/text()").extract_first()
    if keywords:
        il.add_value("category", keywords.split(", "))
    il.add_xpath("updated", "news:news/news:publication_date/text()")
    return scrapy.Request(
        url, self.parse_item, meta={"il": il, "handle_httpstatus_list": [404]}
    )

def parse_item(self, response):
    il = FeedEntryItemLoader(
        response=response,
        base_url="{}/".format(self.feed_link),
        timezone="Europe/Vienna",
        dayfirst=True,
        remove_elems=[".ruler", "h1"],
    )
    il.add_css("title", "h1.event-title::text")
    il.add_value("link", response.url)
    il.add_css("content_html", "div#content.container")
    return il.load_item()

def parse_item_text(self, response):
    remove_elems = [".ad-component", ".wp-caption-text"]
    il = FeedEntryItemLoader(
        response=response,
        parent=response.meta["il"],
        remove_elems=remove_elems,
        base_url="https://{}".format(self.name),
    )
    if response.css(".bluebox"):
        il.add_value("category", "paywalled")
    il.add_css("content_html", "div.pR")
    return il.load_item()

def _parse_weekly_edition(self, response):
    remove_elems = ["h1"]
    change_tags = {
        ".Cat1HL": "h1",
        ".Cat2HL": "h2",
        ".Cat3HL": "h3",
        ".SummaryHL": "h4",
    }
    il = FeedEntryItemLoader(
        response=response,
        parent=response.meta["il"],
        change_tags=change_tags,
        remove_elems=remove_elems,
        base_url=f"https://{self.name}",
    )
    for url in response.css("h2.SummaryHL a::attr(href)").extract():
        yield scrapy.Request(
            response.urljoin(url),
            self._parse_article,
            meta={"il": None, "updated": response.meta["updated"]},
        )

    # Remove articles that have their own page.
    text = []
    in_article = False
    for line in response.css(".ArticleText").extract_first().splitlines(True):
        # Beginning of article.
        if '<h2 class="SummaryHL"><a href="/Articles/' in line:
            in_article = True
        if not in_article:
            text.append(line)
        # End of article. Note that the links to the comments don't always
        # include "#comments", so we can't check for that.
        if '">Comments (' in line:
            in_article = False
    text = "".join(text)
    # Remove page editor.
    text = re.sub(r"<b>Page editor</b>: .*", "", text)
    # Recursively remove headings with no content.
    text = _remove_empty_headings(text)

    il.add_css("title", "h1::text")
    il.add_value("content_html", text)
    il.add_value("link", response.url)
    yield il.load_item()

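# `_remove_empty_headings` above is not defined in these excerpts. A minimal
# sketch under the assumption that "empty" means a heading followed (up to
# whitespace) only by another heading or the end of the text; the regex and
# the fixed-point loop are assumptions, not the project's actual helper.
def _remove_empty_headings(text):
    # A heading counts as empty if the next non-whitespace markup is another
    # heading or the end of the document. Repeat until nothing changes, so
    # that headings emptied by a previous pass are removed as well.
    pattern = re.compile(r"<h(\d)[^>]*>.*?</h\1>\s*(?=<h\d|\Z)", re.DOTALL)
    while True:
        stripped = pattern.sub("", text)
        if stripped == text:
            return text
        text = stripped
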
def parse_item_text(self, response):
    remove_elems = [".dachzeile", "h1", ".meta", "br", "form", ".button-container"]
    il = FeedEntryItemLoader(
        response=response,
        parent=response.meta["il"],
        remove_elems=remove_elems,
        base_url="https://{}".format(self.name),
    )
    content = response.xpath("//article").extract_first()
    if "Lesen Sie diesen Artikel in voller Länge" in content:
        il.add_value("category", "paywalled")
    il.add_value("content_html", content)
    return il.load_item()

def parse(self, response):
    m = re.search("window.DELINSKI, {listViewEntities: (.*)}", response.text)
    restaurants = sorted(
        json.loads(m.group(1))["restaurants"]["entities"].values(),
        key=lambda r: int(r["created"]),
        reverse=True,
    )
    for restaurant in restaurants[:20]:
        il = FeedEntryItemLoader(timezone="UTC", base_url=response.url)
        url = response.urljoin(restaurant["url"])
        il.add_value("link", url)
        il.add_value("title", restaurant["name"])
        content = """
        <img src="{image}">
        <ul>
            <li>{address}</li>
            <li>{price_range_human}</li>
            <li>{cuisine_text}</li>
        </ul>
        """
        il.add_value("content_html", content.format(**restaurant))
        il.add_value(
            "updated", datetime.utcfromtimestamp(int(restaurant["created"]))
        )
        yield scrapy.Request(url, self._parse_restaurant, meta={"il": il})

def parse_release_notes(self, response):
    il = FeedEntryItemLoader(
        response=response, timezone="Europe/Berlin", base_url=self._base_url
    )
    il.add_xpath("title", "//h1/text()")
    il.add_value("link", response.url)
    il.add_xpath("updated", '//div[@class="docInfo"]', re="Last modified: (.*) by")
    il.add_value("content_html", "<h1>Release Notes</h1>")
    il.add_xpath("content_html", "//h1/following-sibling::*")
    return scrapy.Request(
        response.url.replace("notes-", "changelog-"),
        self.parse_release_changelog,
        meta={"il": il},
    )

def _parse_article(self, response):
    feed_entry = response.meta["feed_entry"]

    il = FeedEntryItemLoader(parent=response.meta["il"])
    try:
        response.text
    except AttributeError:
        # Response is not text (e.g. PDF, ...).
        il.add_value("title", feed_entry.get("title"))
        il.add_value("content_html", feed_entry.get("summary"))
        return il.load_item()

    doc = Document(response.text, url=response.url)
    il.add_value("title", doc.short_title() or feed_entry.get("title"))
    summary = feed_entry.get("summary")
    try:
        content = doc.summary(html_partial=True)
        if summary and len(summary) > len(content):
            # Something probably went wrong if the extracted content is
            # shorter than the summary.
            raise Unparseable
    except Unparseable:
        content = summary
    il.add_value("content_html", content)
    return il.load_item()

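# `Document` and `Unparseable` above come from the readability-lxml package;
# the snippet assumes imports along these lines.
from readability import Document
from readability.readability import Unparseable
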
def _parse_video_page(self, response):
    match = re.search(
        r"https?://(?:www\.)?servustv\.com/videos/(?P<id>[aA]{2}-\w+|\d+-\d+)",
        response.url,
    )
    if not match:
        return
    video_id = match.group("id").upper()

    il = FeedEntryItemLoader(response=response)
    il.add_value("link", response.url)
    section = response.css(
        "meta[property='article:section']::attr('content')"
    ).extract_first()
    if section != "Allgemein":
        il.add_value("title", section)
    il.add_css("title", "title::text", re="(.*) - Servus TV")
    image_url = response.css(
        "meta[property='og:image']::attr('content')"
    ).extract_first()
    il.add_value("content_html", '<img src="{}">'.format(image_url))
    il.add_css("content_html", "meta[property='og:description']::attr('content')")
    il.add_css("content_html", "#media-asset-content-container")
    match = re.search(r'"dateModified":\s*"([^"]+)"', response.text)
    if match:
        il.add_value("updated", match.group(1))

    stream_url = "https://stv.rbmbtnx.net/api/v1/manifests/%s.m3u8" % video_id
    yield Request(stream_url, self._parse_stream, meta={"il": il})

def parse_release_notes(self, response):
    il = FeedEntryItemLoader(
        response=response,
        timezone="Europe/Berlin",
        base_url=self.feed_link,
        remove_elems=[".cookielaw-banner"],
    )
    il.add_xpath("title", "//h1/text()")
    il.add_value("link", response.url)
    il.add_xpath("updated", '//div[@class="docInfo"]', re="Last modified: (.*) by")
    il.add_value("content_html", "<h1>Release Notes</h1>")
    il.add_xpath("content_html", "//h1/following-sibling::*")
    return il.load_item()

def _parse_article(self, response):
    remove_elems = [
        '.projectNav',
        'h1',
        '.socialMedia__headline',
        '.whyRead',
        '.overlayCTA',
        '.authors',
        '.socialMedia',
        '.sidebar',
        '.sectionBackground--colorTheme1',
        '.heroStage__copyright',
        '.heroStage__downLink',
        'script',
        'iframe',
        '.image__zoom ',
        '.image__copyrightWrapper',
        '.callToAction',
        '.print-action',
        '.internalLink span',
    ]
    change_tags = {
        'div.heroStage__introText': 'strong',
        'figcaption': 'i',
        'figure': 'div',
    }
    replace_regex = {
        r'<span data-src="([^"]+)"></span>.*?' +
        r'<span data-src="([^"]+)" data-min-width="1000">':
            r'<a href="\2"><img src="\1"></a>',
        r'<div style=".*?"><video.*?></video>.*?</div></div>':
            '<em>Das eingebettete Video ist nur im Artikel verfügbar.</em>',
    }
    il = FeedEntryItemLoader(response=response,
                             timezone=self._timezone,
                             base_url='https://www.{}'.format(self.name),
                             remove_elems=remove_elems,
                             change_tags=change_tags,
                             replace_regex=replace_regex)
    il.add_value('link', response.url)
    il.add_value('author_name', 'Addendum')
    il.add_css('title', 'meta[property="og:title"]::attr(content)')
    il.add_css('updated',
               'meta[property="article:modified_time"]::attr(content)')
    # If not yet modified:
    il.add_css('updated',
               'meta[property="article:published_time"]::attr(content)')
    il.add_css('content_html', '.content')
    yield il.load_item()

def _parse_article(self, response):
    feed_entry = response.meta["feed_entry"]
    il = FeedEntryItemLoader(parent=response.meta["il"])
    doc = Document(response.text, url=response.url)
    il.add_value("title", doc.short_title() or feed_entry.get("title"))
    summary = feed_entry.get("summary")
    try:
        content = doc.summary(html_partial=True)
        if summary and len(summary) > len(content):
            # Something probably went wrong if the extracted content is
            # shorter than the summary.
            raise Unparseable
    except Unparseable:
        content = summary
    il.add_value("content_html", content)
    return il.load_item()

def _parse_item(self, response):
    remove_elems = [
        "h1",
        ".nono",
        ".acceptance_org",
        ".state",
        "script",
        ".gentics-portletreload-position-notvisibleposition",
    ]
    remove_elems_xpath = [
        """
        //div[
            @class='advice' and
            child::div[@class='advice_text' and (
                contains(., 'nicht die aktuelle Rechtslage') or
                contains(., 'wird nicht laufend aktualisiert') or
                contains(., 'Übersicht über bisherige "Themen des Monats"')
            )]
        ]
        """,
        # Remove table of contents.
        "//li[child::a[starts-with(@href, '#')]]",
        "//ul[not(li)]",
    ]
    change_tags = {"abbr": "span"}
    il = FeedEntryItemLoader(
        response=response,
        timezone="Europe/Vienna",
        base_url="https://www.{}".format(self.name),
        remove_elems=remove_elems,
        remove_elems_xpath=remove_elems_xpath,
        change_tags=change_tags,
        dayfirst=True,
    )
    il.add_value("link", response.url)
    il.add_xpath(
        "author_name",
        '//div[@class="acceptance_org"]/text()[preceding-sibling::br]',
    )
    il.add_css("title", "title::text", re=r"HELP.gv.at:\s*(.*)")
    il.add_value(
        "updated", response.css(".state").re_first(r"(\d{2}\.\d{2}\.\d{4})")
    )
    il.add_css("content_html", ".Content")
    return il.load_item()

def _parse_article(self, response):
    remove_elems = [
        ".noprint",
        "form",
        "font[size='3'] > b",
        "font[size='2'] > b:first-child",
        'a[href="mailto:[email protected]"]',
        # Repeated entries: each pass removes the then-first (or then-last)
        # <br>, so up to six leading and trailing line breaks are stripped.
        "br:first-child",
        "br:first-child",
        "br:first-child",
        "br:first-child",
        "br:first-child",
        "br:first-child",
        "br:last-child",
        "br:last-child",
        "br:last-child",
        "br:last-child",
        "br:last-child",
        "br:last-child",
    ]
    replace_regex = {
        r"\[\d{2}\.\d{2}\.\d{4}\]": "",
        # A0 is a non-breaking space in latin1.
        "\xA0": "",
        r"<br>\s*<br>\s*\d{1,2}\.\d{1,2}\.\d{4}\s*<br>": "",
    }
    change_attribs = {"font": {"size": None, "face": None, "color": None}}
    change_tags = {"font": "div", "center": "div"}
    il = FeedEntryItemLoader(
        response=response,
        base_url=response.url,
        remove_elems=remove_elems,
        replace_regex=replace_regex,
        change_attribs=change_attribs,
        change_tags=change_tags,
        parent=response.meta["il"],
    )
    il.add_css("author_name", ".sidebar .authors__name::text")
    if response.css(".printwidth2"):
        il.add_css("content_html", ".printwidth2")
    else:
        # Tagebuch
        il.add_css("content_html", ".lineall")
        il.add_value("category", "Tagebuch")
    return il.load_item()

def _parse_article(self, response):
    def _fix_img_src(elem):
        if "data-src" in elem.attrib:
            elem.attrib["src"] = elem.attrib["data-src"]
        return elem

    if response.status == 410:
        # Article has been deleted.
        return

    remove_elems = [
        ".artDetail__header__container",
        ".artDetail__extImage__copyright",
        "#readspeaker_button1",
        ".artDetail__userOptions",
        ".container__col--hide",
        ".container__col--mdHide",
        ".artDetailMeineThemen__outer",
        ".artDetailAutor__outer",
        ".artDetailMehrZu",
        "div[style='display: none;']",
        ".artDetail__ooenplusOverlay",
    ]
    replace_elems = {"img": _fix_img_src}
    il = FeedEntryItemLoader(
        response=response,
        timezone="Europe/Vienna",
        base_url="https://www.{}".format(self.name),
        remove_elems=remove_elems,
        replace_elems=replace_elems,
        dayfirst=True,
        yearfirst=False,
    )
    if response.css(".mainLogin__linkToggle"):
        il.add_value("category", "paywalled")
    il.add_css("link", 'link[rel="canonical"]::attr(href)')
    il.add_css("title", 'meta[property="og:title"]::attr(content)')
    il.add_css("author_name", ".artDetailAutor__headline::text")
    # Mon, 01 Oct 18 13:42:45 +0200
    il.add_css("updated", 'meta[name="date"]::attr(content)')
    il.add_css("content_html", "article.artDetail")
    il.add_css("category", ".artDetailOrt__linkText::text")
    il.add_value("path", response.meta["ressort"])
    return il.load_item()

def parse_item(self, response):
    if response.status == 404:
        self.logger.info(
            "Article '{}' not available anymore.".format(response.url))
        return

    def _clean_caption(elem):
        if "–" in elem.text:
            # Caption is of the format "text – credit".
            elem.text = re.sub(r"\s*([^–]*).*", r"\1", elem.text)
            return elem
        else:
            # It's just the "credit", remove it.
            return None

    section = response.css(
        'meta[name="kt:section-path"]::attr("content")'
    ).extract_first()[1:]  # Skip the first /.
    if section not in self._sections and "all" not in self._sections:
        # Ignore the response as the ressort should not be parsed.
        return

    il = FeedEntryItemLoader(
        response=response,
        parent=response.meta["il"],
        remove_elems=[
            ".ad",
            ".article-paid",
            ".js-overlay-close",
            ".swiper-lazy-preloader",
        ],
        change_tags={".article__lead": "strong"},
        change_attribs={".zoomable__image--zoomed": {"data-src": "src"}},
        replace_elems={".article__media-caption": _clean_caption},
        base_url="https://www.{}".format(self.name),
    )
    il.add_css(
        "author_name",
        "article .article__author ::text",
        re=re.compile(r"\s*(?:[Vv]on\s*)?(.+)", flags=re.DOTALL),
    )
    il.add_css("content_html", "article .article__media .zoomable__inner")
    il.add_css("content_html", "article .article__lead")  # change tags to strong
    il.add_css("content_html", "article .article__body")
    if response.css(".article-paid"):
        il.add_value("category", "paywalled")
    il.add_value("category", section.split("/"))
    if "all" in self._sections:
        il.add_value("path", "all")
    if section in self._sections:
        il.add_value("path", section)
    return il.load_item()

def parse_item(self, response):
    il = FeedEntryItemLoader(
        selector=response.xpath('//div[@id="maincontentbook"]'),
        base_url=self.feed_link,
    )
    il.add_xpath("title", '//h1[@class="p_book_title"]/text()')
    il.add_xpath("title", '//h3[@class="p_book_title_ebook"]/text()')
    il.add_value("link", response.url)
    il.add_value("author_name", self.feed_title)
    il.add_xpath("content_html", '//h1[@class="p_book_title"]/text()')
    il.add_xpath("content_html", '//h2[@class="p_book_author"]/text()')
    il.add_xpath("content_html", '//p[@class="p_book_publisher"]/text()')
    il.add_xpath("content_html", '//p[@class="p_book_isbn"]/text()')
    il.add_xpath("content_html", '(//span[@class="txt10px"])[1]/text()')
    il.add_xpath("content_html", '(//span[@class="txt10px"])[3]/text()')
    il.add_xpath("content_html", '//div[@class="bookcontent"]//text()')
    il.add_xpath("content_html", '//div[@class="p_book_image"]/img')
    il.add_xpath("content_html", '//span[@style="color:red;"]/b/text()')
    return il.load_item()

def parse_item(self, response):
    il = FeedEntryItemLoader(
        response=response,
        base_url="{}/cms/".format(self.feed_link),
        timezone="Europe/Vienna",
        remove_elems=[".news-latest-date", ".news-single-rightbox", "hr", "h7"],
        remove_elems_xpath=[
            '//div[@class="news-single-item"]/b[1]',
            '//div[@class="news-single-item"]/br[1]',
        ],
        dayfirst=True,
    )
    il.add_value(
        "title", response.xpath("//head/title/text()").re_first(r"::: (.*)")
    )
    il.add_value("link", response.url)
    il.add_value(
        "updated",
        response.xpath('//div[@class="news-single-rightbox"]').re_first(
            r"(\d{2}\.\d{2}\.\d{4})"
        ),
    )
    il.add_value(
        "author_name",
        response.xpath('//head/meta[@name="publisher"]/@content').re_first(
            "recht.at, (.*);"
        ),
    )
    il.add_xpath("author_name", '//head/meta[@name="author"]/@content')
    il.add_value("author_name", self.name)
    il.add_xpath("author_email", '//head/meta[@name="reply-to"]/@content')
    il.add_css("content_html", ".news-single-item h7 font strong")
    il.add_css("content_html", ".news-single-item")
    return il.load_item()

def parse_node(self, response, node):
    il = FeedEntryItemLoader(
        response=response, base_url="https://{}".format(self.name), dayfirst=True
    )
    il.add_value("updated", node.xpath("//pubDate/text()").extract_first())
    il.add_value("author_name", node.xpath("//dc:creator/text()").extract_first())
    il.add_value("category", node.xpath("//category/text()").extract())
    title = node.xpath("(//title)[2]/text()").extract()
    if not title:
        # Fall back to the first category if no title is provided (e.g. comic).
        title = node.xpath("//category/text()").extract_first()
    il.add_value("title", title)
    link = node.xpath("(//link)[2]/text()").extract_first()
    il.add_value("link", link)
    if self._steady_token:
        cookies = {"steady-token": self._steady_token}
    else:
        cookies = None
    return scrapy.Request(link, self._parse_article, cookies=cookies, meta={"il": il})

def _parse_interview(self, response):
    remove_elems = [
        ".shareable-quote",
        ".share-bar",
        # Remove the last two h2s and all paragraphs below: the first pair
        # strips the last h2 and its trailing paragraphs, the second pair
        # strips the h2 that becomes last after that.
        ".interview-body > h2:last-of-type ~ p",
        ".interview-body > h2:last-of-type",
        ".interview-body > h2:last-of-type ~ p",
        ".interview-body > h2:last-of-type",
    ]
    il = FeedEntryItemLoader(
        response=response,
        base_url="https://{}".format(self.name),
        remove_elems=remove_elems,
    )
    il.add_value("link", response.url)
    il.add_css("title", "h1::text")
    il.add_css("author_name", "header .user-link__name::text")
    il.add_css("content_html", ".interview-body")
    il.add_value("updated", response.meta["updated"])
    return il.load_item()

def _parse_episode(self, response):
    il = FeedEntryItemLoader(
        response=response,
        base_url="https://{}".format(self.name),
        timezone="Europe/Vienna",
        dayfirst=True,
    )
    il.add_value("link", response.url)
    il.add_xpath(
        "title",
        '//meta[@name="title"]/@content',
        re=r"(?s)(.*?)(?: vom .*)? - puls4\.com",
    )
    il.add_value(
        "updated",
        "{} {}".format(
            response.xpath('//meta[@name="title"]/@content').re_first(
                r".*vom (\d{2}\.\d{2}\.\d{4}).*"
            ),
            response.meta["time"] or "00:00",
        ),
    )
    il.add_value(
        "content_html",
        '<img src="{}">'.format(
            response.xpath('//meta[@property="og:image"]/@content').extract_first()
        ),
    )
    il.add_css("content_html", ".player-video-description-intro::text")
    return il.load_item()

def parse_item(self, response):
    remove_elems = [
        "aside",
        "script",
        "h1",
        "source",
        ".breadcrumbs",
        ".author-date",
        ".artikel-social-kommentar",
        ".bild-copyright",
        ".ressortTitleMobile",
        ".article-number",
        ".artikel-kommentarlink",
        ".umfrage-wrapper",
        ".articleIssueInfo",
    ]
    il = FeedEntryItemLoader(
        response=response,
        base_url="https://{}".format(self.name),
        remove_elems=remove_elems,
    )
    il.add_value("link", response.url)
    author_name = (
        response.css(".author-date ::text").re(r"(?:Von)?\s*(\w+ \w+)") or "Red."
    )
    il.add_value("author_name", author_name)
    il.add_css("title", 'h1[itemprop="headline"]::text')
    il.add_value("updated", response.meta["updated"])
    il.add_css("content_html", "article")
    return il.load_item()

def parse_item(self, response):
    il = FeedEntryItemLoader(response=response, base_url=self._base_url)
    il.add_value("updated", response.meta["updated"])
    il.add_value("author_name", response.meta["author_name"])
    il.add_value("link", response.url)
    il.add_css("title", "title::text", re="(.*) - The Oatmeal")
    il.add_value("category", urlsplit(response.url).path.strip("/").split("/")[0])
    # comics
    il.add_css("content_html", "#comic > img")
    il.add_css("content_html", "#comic > p > img")
    # blog
    il.add_css("content_html", "#blog .center_text img")
    return il.load_item()

def parse_archive_search(self, response):
    for i, item in enumerate(json.loads(response.text)["result"]["hits"]):
        il = FeedEntryItemLoader(
            response=response,
            base_url="https://{}".format(self.name),
            timezone="Europe/Vienna",
        )
        il.add_value("path", "magazine")
        link = response.urljoin(item["detail_link"])
        il.add_value("link", link)
        try:
            author = re.sub(
                r"(?:.*:|Von)\s*(.*)", r"\1", ", ".join(item["authors"]).title()
            )
            il.add_value("author_name", author)
        except IndexError:
            pass
        il.add_value("title", item["title"])
        # All articles have the same date.
        # We add an offset so they are sorted in the right order.
        date = response.meta["issue_date"] + timedelta(seconds=i)
        il.add_value("updated", date)
        yield scrapy.Request(link, self.parse_item_text, meta={"il": il})

def _parse_user_profile(self, response):
    self._users[response.meta["user_id"]] = (
        response.css("#up_user h2::text").extract_first().strip()
    )
    for posting in response.css(".posting"):
        il = FeedEntryItemLoader(
            selector=posting,
            base_url="https://{}".format(self.name),
            change_tags={"span": "p"},
        )
        il.add_css("title", ".text strong::text")
        il.add_css("link", '.text a::attr("href")')
        il.add_value(
            "updated",
            datetime.utcfromtimestamp(
                # The data-timestamp attribute is in milliseconds.
                int(posting.css('.date::attr("data-timestamp")').extract_first())
                / 1000
            ),
        )
        il.add_css("content_html", ".text span")
        il.add_css("content_html", ".article h4")
        il.add_value("path", response.meta["path"])
        yield il.load_item()

def parse_content(self, response):
    parts = self._extract_parts(response)
    il = FeedEntryItemLoader(
        response=response, timezone="Europe/Vienna", dayfirst=True
    )
    il.add_value("path", self._library)
    il.add_value("title", " - ".join(parts[: self._find_first_meta(parts)]))
    il.add_value("link", response.url)
    il.add_xpath("updated", "//td/span/text()", re="In der Bibliothek seit: (.*)")
    _content = ["<ul>"]
    for part in parts:
        _content.append("<li>{}</li>".format(part))
    _content.append("</ul>")
    il.add_value("content_html", "".join(_content))
    return il.load_item()

def parse_item(self, response):
    il = FeedEntryItemLoader(
        selector=response.xpath('//div[@class="main"]'), timezone="Europe/Vienna"
    )
    il.add_xpath("title", "h1/text()")
    il.add_value("link", response.url)
    il.add_xpath("content_html", "h1/following-sibling::*")
    il.add_value("updated", response.url.rstrip("/").split("/")[-1].split("_")[0])
    il.add_value("author_name", self.name)
    return il.load_item()

def parse_node(self, response, node):
    link = node.xpath("link/text()").extract_first()
    il = FeedEntryItemLoader()
    il.add_value("title", node.xpath("title/text()").extract_first())
    il.add_value("updated", node.xpath("pubDate/text()").extract_first())
    il.add_value("category", node.xpath("category/text()").extract())
    return scrapy.Request(
        link,
        self._parse_article,
        cookies={"view": "mobile"},
        meta={"il": il, "path": response.meta["path"], "first_page": True},
    )

def parse(self, response):
    mitteilungsblaetter = response.css(".mitteilungsblaetter")
    updated = mitteilungsblaetter.css("::text").re_first(r"(\d{2}\.\d{2}\.\d{4})")
    link = response.urljoin(
        mitteilungsblaetter.css('a::attr("href")').extract_first()
    )

    response = yield scrapy.Request(link, method="HEAD")
    mb_url = response.url
    match = re.search(
        r"https://tiss.tuwien.ac.at/mbl/blatt_struktur/anzeigen/(\d+)", mb_url
    )
    if not match:
        self.logger.error("No Mitteilungsblätter found!")
        return
    else:
        mb_id = match.group(1)

    url = "https://tiss.{}/api/mbl/v22/id/{}".format(self.name, mb_id)
    response = yield scrapy.Request(url)
    last_entry = None
    for entry in reversed(json.loads(response.text)["knoten"]):
        (entry["main"], entry["sub"]) = re.match(
            r"(\d+)\.?(\d*)", entry["counter"]
        ).groups()
        if last_entry is not None and last_entry["main"] == entry["main"]:
            entry["inhalt"] += "<h2>{}</h2>".format(last_entry["titel"])
            entry["inhalt"] += last_entry["inhalt"]
        if entry["sub"] == "":
            il = FeedEntryItemLoader(
                base_url="https://tiss.{}".format(self.name),
                timezone="Europe/Vienna",
                dayfirst=True,
            )
            il.add_value("updated", updated)
            il.add_value("link", mb_url + "#{}".format(entry["counter"]))
            il.add_value("title", entry["titel"])
            il.add_value("content_html", entry["inhalt"])
            yield il.load_item()
            last_entry = None
        else:
            last_entry = entry

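# The `response = yield scrapy.Request(...)` pattern above only works when
# the callback is wrapped so that a yielded Request resolves to its Response,
# e.g. with the scrapy-inline-requests package. A minimal sketch; the spider
# class and its name are hypothetical.
from inline_requests import inline_requests


class MitteilungsblaetterSpider(scrapy.Spider):  # hypothetical
    name = "tuwien.ac.at"  # hypothetical

    @inline_requests
    def parse(self, response):
        # Yielding a Request suspends the method and resumes it with the
        # corresponding Response instead of scheduling a separate callback.
        head = yield scrapy.Request(response.url, method="HEAD")
        self.logger.info("Resolved to %s", head.url)
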
def parse(self, response):
    if len(response.css(".thumbnail")) == 0:
        self.logger.info("No items found.")
        return

    for item in response.css(".thumbnail"):
        il = FeedEntryItemLoader(selector=item, base_url=self._base_url)
        il.add_css("title", ".item_brand_text ::text")
        il.add_css("title", ".item-title ::text")
        il.add_css("title", ".current-price ::text")
        il.add_value(
            "link",
            response.urljoin(item.css(".item-link::attr(href)").extract_first()),
        )
        image_url = item.css(".item-image::attr(data-bg)").re_first(
            r"url\(([^)]+)\)"
        )
        # Fix broken images.
        if image_url.startswith("https://markenankauf.momox.de/pics/https://"):
            image_url = image_url.replace(
                "https://markenankauf.momox.de/pics/https://", "https://"
            )
        il.add_value("content_html", '<img src="{}">'.format(image_url))
        il.add_css("content_html", ".item-des-container")
        il.add_value("path", response.meta["path"])
        yield il.load_item()

    page = int(response.css(".pagination .active a::text").extract_first())
    if page == 1:
        yield generate_feed_header(
            title=response.css("title ::text").re_first(
                "(ubup | .*) Second Hand kaufen"
            ),
            subtitle="Deutschlands größter Second Hand-Onlineshop für "
            "Mode & Accessoires",
            icon="https://www.{}/images/favicon.ico".format(self.name),
            link=response.url,
            path=response.meta["path"],
        )
    if page < self._scrape_pages:
        next_page = response.css(
            ".pagination .active + li a::attr(href)"
        ).extract_first()
        if next_page:
            yield scrapy.Request(
                response.urljoin(next_page),
                meta={"dont_cache": True, "path": response.meta["path"]},
            )

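# `generate_feed_header` above is called but not defined in these excerpts.
# A plausible shape for such a helper, assuming a FeedItemLoader counterpart
# to FeedEntryItemLoader for feed-level metadata; the loader name and the
# field names are assumptions.
def generate_feed_header(
    title=None, subtitle=None, link=None, path=None, icon=None, logo=None
):
    # Hypothetical: collect feed-level metadata into one item that the feed
    # exporter turns into the channel header.
    il = FeedItemLoader()
    il.add_value("title", title)
    il.add_value("subtitle", subtitle)
    il.add_value("link", link)
    il.add_value("path", path)
    il.add_value("icon", icon)
    il.add_value("logo", logo)
    return il.load_item()
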
def parse(self, response):
    # Wiener Linien returns HTML with an XML content type which creates an
    # XmlResponse.
    response = HtmlResponse(url=response.url, body=response.body)
    for item in response.css(".block-news-item"):
        il = FeedEntryItemLoader(
            response=response,
            timezone="Europe/Vienna",
            ignoretz=True,
            base_url="https://www.{}".format(self.name),
        )
        link = response.urljoin(item.css("a::attr(href)").extract_first())
        il.add_value("link", link)
        il.add_value("title", item.css("h3::text").extract_first())
        il.add_value("updated", item.css(".date::text").extract_first())
        yield scrapy.Request(link, self.parse_item, meta={"il": il})

def parse_program(self, response):
    if not response.css(r".jsb_video\/FlashPlayer"):
        return

    data = json.loads(
        response.css(r".jsb_video\/FlashPlayer").xpath("@data-jsb").extract()[0]
    )
    data = data["config"]["initial_video"]["parts"][0]["tracking"]["nurago"]
    il = FeedEntryItemLoader(
        response=response,
        base_url="https://{}".format(self.name),
        timezone="Europe/Vienna",
        dayfirst=True,
    )
    il.add_value("link", data["clipurl"])
    il.add_value("title", data["programname"])
    il.add_value("updated", data["airdate"])
    il.add_xpath("content_html", '//p[@class="plot_summary"]')
    item = il.load_item()
    # Only include videos posted in the last 7 days.
    if item["updated"] + self._timerange > datetime.now(timezone.utc):
        return item

def _parse_article_url(self, response):
    if not response.css("#content"):
        raise DropResponse(
            "Skipping {} since it is empty".format(response.url), transient=True
        )

    if "Fehler" in response.css("h2 ::text").extract_first():
        raise DropResponse(
            "Skipping {} since it returned an error".format(response.url),
            transient=True,
        )

    remove_elems = ['div[style="padding-top:10px;"]']
    il = FeedEntryItemLoader(
        response=response,
        timezone="Europe/Vienna",
        base_url="https://{}".format(self.name),
        dayfirst=True,
        remove_elems=remove_elems,
    )
    il.add_value("link", response.url)
    il.add_value("author_name", "VKI")
    date = response.css(".issue").re_first(
        r"veröffentlicht:\s*([0-9]{2}\.[0-9]{2}\.[0-9]{4})"
    )
    il.add_value("updated", date)
    url = response.xpath('//a[text()="Druckversion"]/@onclick').re_first(
        r"window\.open\('(.*)'\);"
    )
    il.add_css("title", "h1::text")
    if url:
        return scrapy.Request(
            response.urljoin(url), callback=self._parse_article, meta={"il": il}
        )
    else:
        il.add_value("category", "paywalled")
        il.add_css("content_html", ".primary")
        il.add_css("content_html", 'div[style="padding-top:10px;"] > h3')
        return il.load_item()

def parse(self, response):
    articles = json.loads(response.text)
    remove_elems = [
        "hr + p",
        "hr",
        "iframe",
        "p i:last-of-type:contains('Facebook'):contains('Twitter')",
    ]
    for article in articles:
        il = FeedEntryItemLoader(timezone="UTC", remove_elems=remove_elems)
        il.add_value("title", article["title"])
        il.add_value("link", article["url"])
        if "thumbnail_url_1_1" in article:
            il.add_value(
                "content_html",
                '<img src="{}">'.format(article["thumbnail_url_1_1"]),
            )
        il.add_value("content_html", article["body"])
        il.add_value(
            "updated", datetime.utcfromtimestamp(article["publish_date"] / 1000)
        )
        il.add_value(
            "author_name",
            [
                contribution["contributor"]["full_name"]
                for contribution in article["contributions"]
            ],
        )
        il.add_value("category", article["channel"]["name"])
        for topic in article["topics"] + [article["primary_topic"]]:
            if topic and "name" in topic:
                il.add_value("category", topic["name"].title())
        if article["nsfw"]:
            il.add_value("category", "nsfw")
        if article["nsfb"]:
            il.add_value("category", "nsfb")
        il.add_value("path", response.meta["locale"])
        yield il.load_item()

def _parse_article(self, response):
    def _fix_img_src(elem):
        src = elem.attrib.pop("data-zoom-src", None)
        # data-zoom-src is only valid if it starts with //images.derstandard.at.
        if src and src.startswith("//images.derstandard.at"):
            elem.attrib["src"] = src
        elem.attrib.pop("width", None)
        elem.attrib.pop("height", None)
        elem.attrib.pop("class", None)
        return elem

    remove_elems = [
        ".credits",
        ".owner-info",
        ".image-zoom",
        ".continue",
        ".sequence-number",
        ".js-embed-output",
        "#mycountrytalks-embed",
        # Remove self-promotion for (other) ressorts.
        '.js-embed-output-feeds a[href^="/r"]',
        '.js-embed-output-feeds a[href^="https://derstandard.at/"]',
        (
            ".js-embed-output-feeds "
            + 'img[src="https://images.derstandard.at/2018/10/18/'
            + 'Immobiliensuche202x122.png"]'
        ),
    ]
    change_tags = {
        "#media-list li .description": "figcaption",
        "#media-list li": "figure",
        "#media-list": "div",
        ".photo": "figure",
        ".caption": "figcaption",
    }
    replace_elems = {
        ".embedded-posting": "<p><em>Hinweis: Das eingebettete Posting ist nur "
        + "im Artikel verfügbar.</em></p>",
        # Replace every special script container with its unescaped content.
        "script.js-embed-template": lambda elem: (
            '<div class="js-embed-output-feeds">'
            + html.unescape(elem.text or "")
            + "</div>"
        ),
        "img": _fix_img_src,
    }
    il = FeedEntryItemLoader(
        response=response,
        base_url="https://{}".format(self.name),
        remove_elems=remove_elems,
        change_tags=change_tags,
        replace_elems=replace_elems,
    )
    il.add_value("link", response.url)
    il.add_css("title", 'meta[property="og:title"]::attr(content)')
    for author in response.css("span.author::text").extract():
        # Sometimes the author name is messed up and written in upper case.
        # This usually happens for articles written by Günter Traxler.
        if author.upper() == author:
            author = author.title()
        il.add_value("author_name", author)
    il.add_value("path", response.meta["ressort"])
    il.add_value("updated", response.meta["updated"])
    il.add_css("category", "#breadcrumb .item a::text")
    blog_id = response.css("#userblogentry::attr(data-objectid)").extract_first()
    if blog_id:
        url = (
            "https://{}/userprofil/bloggingdelivery/blogeintrag?godotid={}"
        ).format(self.name, blog_id)
        return scrapy.Request(url, self._parse_blog_article, meta={"il": il})
    elif response.css("#feature-content"):
        cover_photo = response.css("#feature-cover-photo::attr(style)").re_first(
            r"\((.*)\)"
        )
        il.add_value("content_html", '<img src="{}">'.format(cover_photo))
        il.add_css("content_html", "#feature-cover-title h2")
        il.add_css("content_html", "#feature-content > .copytext")
        return il.load_item()
    else:
        il.add_css("content_html", "#content-aside")
        il.add_css("content_html", "#objectContent > .copytext")
        il.add_css("content_html", "#content-main > .copytext")
        il.add_css("content_html", ".slide")
        return il.load_item()