class InterfaxSpider(NewsSpider): name = "interfax" start_urls = ["https://www.interfax.ru/news/2008/02/11"] config = NewsSpiderConfig( title_path='//h1/text()', date_path= '//div[contains(@class, "tMC_head")]/meta[contains(@itemprop, "datePublished")]/@content', date_format="%Y-%m-%dT%H:%M:%S", text_path='//article//text()', topics_path='//div[contains(@class, "textML")]/a/text()') def parse(self, response): today = datetime.datetime.today() first_day = datetime.datetime(year=2008, month=2, day=11) date_range = [ first_day + datetime.timedelta(days=x) for x in range((today - first_day).days) ] for date in date_range: url = "https://www.interfax.ru/news/" + date.strftime("%Y/%m/%d") yield response.follow(url, self.parse_page) def parse_page(self, response): url = response.url page = int(url.split("page_")[-1]) if "page_" in url else 0 for page_href in response.xpath( '//div[contains(@class, "pages")]/a/@href').extract(): if page != 0: continue yield response.follow(page_href, self.parse_page) for document_href in response.xpath( '//div[contains(@class, "an")]/div/a/@href').extract(): yield response.follow(document_href, self.parse_document)
class TvZvezdaSpider(NewsSpider): name = "tvzvezda" start_urls = ["https://tvzvezda.ru/news"] config = NewsSpiderConfig( title_path='//h1/text()', date_path='//div[contains(@class, "date_news")]//text()', date_format="%H:%M %d.%m.%Y", text_path='//div[contains(@class, "glav_text")]//text()', topics_path='//meta[contains(@property, "article:section")]/@content', authors_path='//div[contains(@class, "autor_news")]/a/text()', reposts_fb_path='_', reposts_vk_path='_', reposts_ok_path='_', reposts_twi_path='_', reposts_lj_path='_', reposts_tg_path='_', likes_path='_', views_path='_', comm_count_path='_' ) news_le = LinkExtractor(restrict_css='div.js-ajax-receiver a.news_one') z=0 visited_urls = [] def parse(self, response): if response.url not in self.visited_urls: for link in self.news_le.extract_links(response): yield scrapy.Request(url=link.url, callback=self.parse_document) next_pages = response.xpath('//a[contains(@class, "all_news js-ajax-call")]/@href').extract() next_pages=next_pages[-1] new_url='20/'+str(self.z)+'/?_=1542171175300' self.z+=20 yield response.follow(next_pages+new_url, callback=self.parse)
class RiaSpider(NewsSpider): name = 'ria' start_urls = ['https://www.ria.ru'] config = NewsSpiderConfig( title_path='//h1[contains(@class, "article__title")]/text()', date_path='//div[contains(@class, "endless__item")]/@data-published', date_format='%Y-%m-%dT%H:%MZ', text_path='//div[contains(@class, "article__block") and @data-type = "text"]//text()', topics_path='//a[contains(@class, "article__tags-item")]/text()', authors_path='_' ) news_le = LinkExtractor(restrict_css='div.lenta__item') def parse(self, response): article_links = self.news_le.extract_links(response) last_link = '' for link in article_links: last_link = link.url yield scrapy.Request(url=link.url, callback=self.parse_document) dt = self._get_last_dt_on_page(last_link) if datetime.strptime(dt, self.config.date_format).date() >= self.until_date: # Getting and forming the next page link next_page_link = response.xpath('//div[contains(@class, "lenta__item")]/@data-next').extract()[0] link_url = '{}{}'.format(self.start_urls[0], next_page_link) yield scrapy.Request(url=link_url, priority=100, callback=self.parse, meta={'page_depth': response.meta.get('page_depth', 1) + 1} ) def parse_document(self, response): for res in super().parse_document(response): # Leave only the last tag # (the last tag is always a global website tag) res['topics'] = [res['topics'][-1]] yield res def _get_last_dt_on_page(self, link): r = requests.get(link) source_code = r.text root = lxml.html.fromstring(source_code) dt = root.xpath(self.config.date_path)[0] return dt
class RbcSpider(NewsSpider):
    name = 'rbc'
    link_tmpl = 'https://www.rbc.ru/v10/ajax/get-news-feed/project/rbcnews/lastDate/{}/limit/22'
    start_urls = [link_tmpl.format(int(time.time()))]
    config = NewsSpiderConfig(
        title_path='(.//span[contains(@class, "js-slide-title")])[1]//text()',
        date_path='_',
        date_format='%Y-%m-%d %H:%M:%S',
        text_path='(.//div[contains(@class, "article__text")])'
                  '/*[not(self::script) and not(self::div[@class="subscribe-infographic"])]//text()',
        topics_path='(.//a[contains(@class, "article__header__category")])[1]//text()',
        authors_path='//div[contains(@class, "article__authors")]/text()'
    )

    def parse(self, response):
        items = json.loads(response.body.decode('utf-8'))['items']

        pub_dt = None
        for i in items:
            resp = HtmlResponse(url='', body=i['html'], encoding='utf8')
            link = resp.xpath('//a/@href').extract()[0]
            pub_dt = datetime.fromtimestamp(i['publish_date_t'])

            if pub_dt.date() >= self.until_date:
                yield scrapy.Request(url=link, callback=self.parse_document, meta={"pub_dt": pub_dt})

        # Request the next page if the publication date of the last article is still on or after "until_date"
        if pub_dt and pub_dt.date() >= self.until_date:
            # Forming the next page link
            link_url = self.link_tmpl.format(int(pub_dt.timestamp()))

            yield scrapy.Request(url=link_url,
                                 priority=100,
                                 callback=self.parse,
                                 meta={'page_depth': response.meta.get('page_depth', 1) + 1})

    def parse_document(self, response):
        for res in super().parse_document(response):
            res['date'] = [response.meta["pub_dt"].strftime(self.config.date_format)]

            # Return the article only if it is hosted on "www.rbc.ru"
            # (not "sportrbc.ru", "delovtb.rbc.ru" etc., because those have a different html layout)
            if res['edition'][0] == '-':
                res['authors'] = [i.replace('\n', '').strip() for i in res['authors']
                                  if i.replace('\n', '').strip()]
                res['text'] = [i.replace('\xa0', ' ') for i in res['text']]
                yield res

class InterfaxSpider(NewsSpider): name = "interfax" start_urls = [ "https://www.interfax.ru/news/{}".format( datetime.datetime.today().strftime("%Y/%m/%d")) ] config = NewsSpiderConfig( title_path='//h1[contains(@itemprop, "headline")]/text()', date_path='//meta[contains(@property, "published_time")]/@content', date_format="%Y-%m-%dT%H:%M%z", text_path= '//article[contains(@itemprop, "articleBody")]/p[not(contains(@itemprop, "author"))]//text()', topics_path='//aside[contains(@class, "textML")]/a//text()', authors_path='//p[contains(@itemprop, "author")]//text()', reposts_fb_path="_", reposts_vk_path="_", reposts_ok_path="_", reposts_twi_path="_", reposts_lj_path="_", reposts_tg_path="_", likes_path="_", views_path="_", comm_count_path="_", ) def parse(self, response): page_date = datetime.datetime.today().date() while page_date >= self.until_date: url = "https://www.interfax.ru/news/" + page_date.strftime( "%Y/%m/%d") yield response.follow(url, self.parse_page) page_date -= datetime.timedelta(days=1) def parse_page(self, response): url = response.url page = int(url.split("page_")[-1]) if "page_" in url else 0 for page_href in response.xpath( '//div[contains(@class, "pages")]/a/@href').extract(): if page != 0: continue yield response.follow(page_href, self.parse_page) for document_href in response.xpath( '//div[contains(@class, "an")]/div/a/@href').extract(): yield response.follow(document_href, self.parse_document)
class GazetaSpider(NewsSpider):
    name = 'gazeta'
    start_urls = ['https://www.gazeta.ru/news/']
    config = NewsSpiderConfig(
        title_path='//h1/text()',
        date_path='//time[contains(@class, "date_time red")]/text()',
        date_format='%d.%m.%Y | %H:%M',
        text_path='//div[contains(@class, "article-text-body")]//text()',
        topics_path='//div[contains(@class, "active")]/a/span/text()')
    news_le = LinkExtractor(restrict_css='div.article_text h1.txt_2b')
    max_page_depth = 4

    def parse(self, response):
        if response.meta.get('page_depth', 1) < self.max_page_depth:
            # Get last article datetime on the current page
            last_page_dt = response.xpath(
                '//time[contains(@class, "txtclear")]/@datetime').extract()[-1]
            # Convert it to datetime without timezone part
            last_page_dt = datetime.strptime(last_page_dt[:-6], '%Y-%m-%dT%H:%M:%S')

            # Forming the next page link
            link_url = '{}?p=page&d={}'.format(
                self.start_urls[0], last_page_dt.strftime('%d.%m.%Y_%H:%M'))

            yield scrapy.Request(
                url=link_url,
                priority=100,
                callback=self.parse,
                meta={'page_depth': response.meta.get('page_depth', 1) + 1})

        for link in self.news_le.extract_links(response):
            yield scrapy.Request(url=link.url, callback=self.parse_document)

    def parse_document(self, response):
        for res in super().parse_document(response):
            # Remove advertisement blocks
            ad_parts = ('\nРеклама\n', '\n.AdCentre_new_adv', ' AdfProxy.ssp',
                        '\nset_resizeblock_handler')
            res['text'] = [
                x for x in res['text']
                if x != '\n' and not x.startswith(ad_parts)
            ]
            yield res

class RiaSpider(NewsSpider): name = 'ria' start_urls = ['https://www.ria.ru'] config = NewsSpiderConfig( title_path='//h1[contains(@class, "article__title")]/text()', date_path='//div[contains(@class, "endless__item")]/@data-published', date_format='%Y-%m-%dT%H:%MZ', text_path= '//div[contains(@class, "article__block") and @data-type = "text"]//text()', topics_path='//a[contains(@class, "article__tags-item")]/text()') news_le = LinkExtractor(restrict_css='div.lenta__item') max_page_depth = 4 def parse(self, response): article_links = self.news_le.extract_links(response) if response.meta.get('page_depth', 1) < self.max_page_depth: # Getting and forming the next page link next_page_link = response.xpath( '//div[contains(@class, "lenta__item")]/@data-next').extract( )[0] link_url = '{}{}'.format(self.start_urls[0], next_page_link) yield scrapy.Request( url=link_url, priority=100, callback=self.parse, meta={'page_depth': response.meta.get('page_depth', 1) + 1}) for link in article_links: yield scrapy.Request(url=link.url, callback=self.parse_document) def parse_document(self, response): for res in super().parse_document(response): # Leave only the last tag # (the last tag is always a global website tag) res['topics'] = [res['topics'][-1]] yield res
class RussiaTodaySpider(NewsSpider): name = "rt" start_urls = ["https://russian.rt.com/news"] config = NewsSpiderConfig( title_path='//h1/text()', date_path='//meta[contains(@name, "mediator_published_time")]/@content', date_format="%Y-%m-%dT%H:%M:%S", text_path='//div[contains(@class, "article__text")]//text()', topics_path='//meta[contains(@name, "mediator_theme")]/@content', authors_path='_') news_le = LinkExtractor(restrict_css='div.listing__card div.card__heading') page_le = LinkExtractor( restrict_css='div.listing__button.listing__button_js', tags=['div'], attrs=['data-href']) max_page_depth = 4 def parse(self, response): if response.meta.get("page_depth", 1) < self.max_page_depth: for link in self.page_le.extract_links(response): yield scrapy.Request(url=link.url, priority=100, callback=self.parse, meta={ "page_depth": response.meta.get("page_depth", 1) + 1 }) for link in self.news_le.extract_links(response): yield scrapy.Request(url=link.url, callback=self.parse_document) def parse_document(self, response): for res in super().parse_document(response): if isinstance(res, Document): if isinstance(res["date"], list): res["date"] = [x[:-6] for x in res["date"] if x] else: res["date"] = res["date"][:-6] yield res
class IzSpider(NewsSpider): name = "iz" start_urls = ["https://iz.ru/feed"] config = NewsSpiderConfig( title_path='//h1/span/text()', date_path= '//div[contains(@class, "article_page__left__top__time__label")]/div/time/@datetime', date_format="%Y-%m-%dT%H:%M:%SZ", text_path='//div[contains(@itemprop, "articleBody")]/div/p//text()', topics_path='//div[contains(@class, "rubrics_btn")]/div/a/text()') visited_urls = [] def parse(self, response): if response.url not in self.visited_urls: for link in response.xpath( '//div[@class="lenta_news__day"]/div/a/@href').extract(): url = urljoin(response.url, link) yield scrapy.Request(url=url, callback=self.parse_document) next_pages = response.xpath( '//a[contains(@class, "button")]/@href').extract() next_pages = next_pages[-1] yield response.follow(next_pages, callback=self.parse)
class InterfaxSpider(NewsSpider): name = "interfax" start_urls = [ "https://www.interfax.ru/news/{}".format( datetime.datetime.today().strftime("%Y/%m/%d")) ] config = NewsSpiderConfig( title_path='//h1/text()', date_path= '//div[contains(@class, "tMC_head")]/meta[contains(@itemprop, "datePublished")]/@content', date_format="%Y-%m-%dT%H:%M:%S", text_path='//article//text()', topics_path='//div[contains(@class, "textML")]/a/text()', authors_path='_') def parse(self, response): page_date = datetime.datetime.today().date() while page_date >= self.until_date: url = "https://www.interfax.ru/news/" + page_date.strftime( "%Y/%m/%d") yield response.follow(url, self.parse_page) page_date -= datetime.timedelta(days=1) def parse_page(self, response): url = response.url page = int(url.split("page_")[-1]) if "page_" in url else 0 for page_href in response.xpath( '//div[contains(@class, "pages")]/a/@href').extract(): if page != 0: continue yield response.follow(page_href, self.parse_page) for document_href in response.xpath( '//div[contains(@class, "an")]/div/a/@href').extract(): yield response.follow(document_href, self.parse_document)
class MeduzaSpider(NewsSpider): name = 'meduza' page_link_tmpl = 'https://meduza.io/api/v3/search?chrono=news&page={}&per_page=24&locale=ru' article_link_tmpl = 'https://meduza.io/{}' start_urls = [page_link_tmpl.format(0)] months_ru = [ 'января', 'февраля', 'марта', 'апреля', 'мая', 'июня', 'июля', 'августа', 'сентября', 'октября', 'ноября', 'декабря', ] fields = [ 'title', 'topics', 'authors', 'edition', 'url', 'text', 'date', ] config = NewsSpiderConfig( title_path= '//h1[@class="RichTitle-root" or @class="SimpleTitle-root" or ' + '@class="RichTitle-root RichTitle-slide"]//text()', date_path='//time[@class="Timestamp-root"]/text()', date_format='%H:%M, %d %m %Y', text_path= '//div[@class="GeneralMaterial-article" or @class="SlidesMaterial-layout" ' + 'or @class="MediaCaption-caption"]//p//text() | //div[@class="MediaCaption-caption"]//text() | ' + '//p[@class="SimpleBlock-p" or @class="SimpleBlock-lead"]//text()', topics_path='_', authors_path='_', reposts_fb_path='_', reposts_vk_path='_', reposts_ok_path='_', reposts_twi_path='_', reposts_lj_path='_', reposts_tg_path='_', likes_path='_', views_path='_', comm_count_path='_', ) def parse(self, response): last_page = False jsonresponse = json.loads(response.body_as_unicode()) # Getting article items articles = [ content for _, content in jsonresponse['documents'].items() ] # Sorting them from the most recent to the most late one articles = sorted(articles, key=lambda x: x['published_at'], reverse=True) # Filtering out late articles and checking if we have reached the "until_date" filtered_articles = [] for content in articles: pub_date = datetime.strptime(content['pub_date'], '%Y-%m-%d').date() if pub_date >= self.until_date: filtered_articles.append(content) else: last_page = True # Iterating through news on this page for content in filtered_articles: full_url = self.article_link_tmpl.format(content['url']) yield scrapy.Request(url=full_url, callback=self.parse_document) # Requesting a new page if needed if not last_page and jsonresponse['has_next']: page_depth = response.meta.get('page_depth', 1) link_url = self.page_link_tmpl.format(page_depth) yield scrapy.Request(url=link_url, priority=100, callback=self.parse, meta={'page_depth': page_depth + 1}) def parse_document(self, response): for res in super().parse_document(response): for field in self.fields: if field not in res: res[field] = [''] for i, month in enumerate(self.months_ru): res['date'][0] = res['date'][0].replace(month, str(i + 1)) yield res
class GazetaSpider(NewsSpider):
    name = 'gazeta'
    start_urls = ['https://www.gazeta.ru/sitemap.shtml']
    config = NewsSpiderConfig(
        title_path='//h1/text()',
        date_path='//time[contains(@itemprop, "datePublished")]/@datetime',
        date_format='%Y-%m-%dT%H:%M:%S%z',
        text_path='//div[contains(@itemprop, "articleBody")]//p//text() | '
                  '//span[contains(@itemprop, "description")]//text()',
        topics_path='//div[contains(@class, "active")]/a/span/text()',
        authors_path='//span[contains(@itemprop, "author")]//text()')
    sitemap_le = LinkExtractor(
        restrict_xpaths='//div[contains(@class, "sitemap_list")]/ul/ul')
    articles_le = LinkExtractor(
        restrict_xpaths='//h2[contains(@itemprop, "headline")]')
    news_le = LinkExtractor(restrict_css='div.article_text h1.txt_2b')

    def parse(self, response):
        for link in self.sitemap_le.extract_links(response):
            yield scrapy.Request(url=link.url, callback=self.parse_page)

    def parse_page(self, response):
        if 'news' in response.url:
            links = self.news_le.extract_links(response)
        else:
            links = self.articles_le.extract_links(response)

        pub_dts = response.xpath(self.config.date_path).extract()
        for pub_dt, link in zip(pub_dts, links):
            # Remove ":" from the timezone part so that the date parses correctly
            pub_dt = pub_dt[:-3] + pub_dt[-3:].replace(':', '')
            pub_dt = datetime.strptime(pub_dt, self.config.date_format)

            if pub_dt.date() >= self.until_date:
                yield scrapy.Request(url=link.url, callback=self.parse_document)

        # Determine if this is the last page
        if pub_dt.date() >= self.until_date:
            # Forming the next page link
            link_url = '{}?p=page&d={}'.format(
                self.start_urls[0], pub_dt.strftime('%d.%m.%Y_%H:%M'))

            yield scrapy.Request(
                url=link_url,
                priority=100,
                callback=self.parse,
                meta={'page_depth': response.meta.get('page_depth', 1) + 1})

    def parse_document(self, response):
        for res in super().parse_document(response):
            # Remove advertisement blocks
            ad_parts = ('\nРеклама\n', '\n.AdCentre_new_adv', ' AdfProxy.ssp',
                        '\nset_resizeblock_handler')
            res['text'] = [
                x.replace('\n', '\\n') for x in res['text']
                if x != '\n' and not x.startswith(ad_parts)
            ]

            # Remove ":" in timezone
            pub_dt = res['date'][0]
            res['date'] = [pub_dt[:-3] + pub_dt[-3:].replace(':', '')]
            yield res

class RussiaTassSpider(NewsSpider): name = "tass" start_urls = ["https://tass.ru/"] config = NewsSpiderConfig( title_path='_', date_path='_', date_format="%Y-%m-%d %H:%M:%S", text_path="div.text-content>div.text-block ::text", topics_path='_') custom_settings = { "DEPTH_LIMIT": 4, "DEPTH_STATS": True, "DEPTH_STATS_VERBOSE": True, "DOWNLOAD_DELAY": 10, "RANDOMIZE_DOWNLOAD_DELAY": True, } category_le = LinkExtractor( restrict_css= 'ul.menu-sections-list>li>div.menu-sections-list__title-wrapper') def parse(self, response): for link in self.category_le.extract_links(response): yield scrapy.Request(url=link.url, priority=100, callback=self.parse_news_category, meta={}) def parse_news_category(self, response): news_section = response.css( "section#news-list::attr(ng-init)").extract_first(default="") section_id = re.findall("sectionId\s+=\s+(.*?);", news_section)[0] exclude_ids = re.findall("excludeNewsIds\s*?=\s*?\'(.*)\';", news_section)[0] paging_data = { "sectionId": int(section_id), "limit": 20, "type": "", "excludeNewsIds": exclude_ids, "imageSize": 434, } yield self._create_api_request(paging_data, response.url) def _create_api_request(self, data, referer): return scrapy.Request(url="https://tass.ru/userApi/categoryNewsList", method="POST", body=json.dumps(data), dont_filter=True, headers={ 'Content-Type': 'application/json', 'Referer': referer }, callback=self.parse_news_list, meta={ "data": data, "referer": referer }) def parse_news_list(self, response): news_data = json.loads(response.body) last_time = news_data.get("lastTime", 0) data = response.meta["data"] referer = response.meta["referer"] data["timestamp"] = last_time yield self._create_api_request(data, referer) for news_item in news_data["newsList"]: url = response.urljoin(news_item["link"]) yield scrapy.Request(url, callback=self.parse_document, meta={"news_item": news_item}) def parse_document(self, response): news_item = response.meta["news_item"] url = response.url base_edition = urlsplit(self.start_urls[0])[1] edition = urlsplit(url)[1] l = ItemLoader(item=Document(), response=response) l.add_value('url', url) l.add_value('edition', '-' if edition == base_edition else edition) l.add_value('title', news_item["title"]) l.add_value('topics', "") l.add_value( 'date', datetime.fromtimestamp(news_item["date"]).strftime( self.config.date_format)) l.add_css('text', self.config.text_path) yield l.load_item()
class KommersantSpider(NewsSpider): name = "kommersant" base_url = "https://www.kommersant.ru" link_tmpl = "https://www.kommersant.ru/archive/list/77/{}" # Start with the current date page_dt = datetime.now() start_urls = [link_tmpl.format(page_dt.strftime("%Y-%m-%d"))] # Ignore "robots.txt" for this spider only custom_settings = {"ROBOTSTXT_OBEY": "False"} config = NewsSpiderConfig( title_path='(.//*[@class="article_name"])[1]//text()', date_path='//meta[contains(@property, "published_time")]/@content', date_format="%Y-%m-%dT%H:%M:%S%z", # 2019-03-09T12:03:10+03:00 text_path='//p[@class="b-article__text"]//text()', topics_path='//meta[contains(@name, "category")]/@content', authors_path='//p[contains(@class, "document_authors")]//text()', reposts_fb_path="_", reposts_vk_path="_", reposts_ok_path="_", reposts_twi_path="_", reposts_lj_path="_", reposts_tg_path="_", likes_path="_", views_path="_", comm_count_path= '//span[contains(@class, "comments-number hide1 hide2")]/text()', ) news_le = LinkExtractor( restrict_xpaths='//div[@class="archive_result__item_text"]') def parse(self, response): # Parse most recent news for i in self.news_le.extract_links(response): yield scrapy.Request(url=i.url, callback=self.parse_document, meta={"page_dt": self.page_dt}) # If it's not the end of the page, request more news from archive by calling recursive "parse_page" function more_link = response.xpath( '//button[contains(@class, "lazyload-button")]/@data-lazyload-url' ).extract() if more_link: yield scrapy.Request( url="{}{}".format(self.base_url, more_link[0]), callback=self.parse_page, meta={"page_dt": self.page_dt}, ) # Requesting the next page if we need to self.page_dt -= timedelta(days=1) if self.page_dt.date() >= self.until_date: link_url = self.link_tmpl.format(self.page_dt.strftime("%Y-%m-%d")) yield scrapy.Request( url=link_url, priority=100, callback=self.parse, meta={ "page_depth": response.meta.get("page_depth", 1) + 1, "page_dt": self.page_dt }, ) def parse_page(self, response): # Parse all articles on page for i in self.news_le.extract_links(response): yield scrapy.Request(url=i.url, callback=self.parse_document) # Take a link from "more" button more_link = response.xpath( '//button[contains(@class, "lazyload-button")]/@data-lazyload-url' ).extract() if more_link: yield scrapy.Request( url="{}{}".format(self.base_url, more_link[0]), callback=self.parse_page, meta={ "page_depth": response.meta.get("page_depth", 1), "page_dt": response.meta["page_dt"] }, ) def parse_document(self, response): for res in super().parse_document(response): # If it's a gallery (no text) or special project then don't return anything (have another html layout) if "text" not in res or "title" not in res: break # Remove ":" in timezone pub_dt = res["date"][0] res["date"] = [pub_dt[:-3] + pub_dt[-3:].replace(":", "")] yield res
class MeduzaSpider(NewsSpider): name = "meduza" page_link_tmpl = "https://meduza.io/api/v3/search?chrono=news&page={}&per_page=24&locale=ru" article_link_tmpl = "https://meduza.io/{}" start_urls = [page_link_tmpl.format(0)] months_ru = [ "января", "февраля", "марта", "апреля", "мая", "июня", "июля", "августа", "сентября", "октября", "ноября", "декабря", ] fields = [ "title", "topics", "authors", "edition", "url", "text", "date", ] config = NewsSpiderConfig( title_path= '//h1[@class="RichTitle-root" or @class="SimpleTitle-root" or ' + '@class="RichTitle-root RichTitle-slide"]//text()', date_path= '//div[@class="GeneralMaterial-materialHeader" or @class="Slide-slide"]//time/text()', date_format="%H:%M, %d %m %Y", text_path= '//div[@class="GeneralMaterial-article" or @class="SlidesMaterial-layout" ' + 'or @class="MediaCaption-caption"]//p//text() | //div[@class="MediaCaption-caption"]//text() | ' + '//p[@class="SimpleBlock-p" or @class="SimpleBlock-lead"]//text()', topics_path="_", authors_path="_", reposts_fb_path="_", reposts_vk_path="_", reposts_ok_path="_", reposts_twi_path="_", reposts_lj_path="_", reposts_tg_path="_", likes_path="_", views_path="_", comm_count_path="_", ) def parse(self, response): last_page = False jsonresponse = json.loads(response.body_as_unicode()) # Getting article items articles = [ content for _, content in jsonresponse["documents"].items() ] # Sorting them from the most recent to the most late one articles = sorted(articles, key=lambda x: x["published_at"], reverse=True) # Filtering out late articles and checking if we have reached the "until_date" filtered_articles = [] for content in articles: pub_date = datetime.strptime(content["pub_date"], "%Y-%m-%d").date() if pub_date >= self.until_date: filtered_articles.append(content) else: last_page = True # Iterating through news on this page for content in filtered_articles: full_url = self.article_link_tmpl.format(content["url"]) yield scrapy.Request(url=full_url, callback=self.parse_document) # Requesting a new page if needed if not last_page and jsonresponse["has_next"]: page_depth = response.meta.get("page_depth", 1) link_url = self.page_link_tmpl.format(page_depth) yield scrapy.Request(url=link_url, priority=100, callback=self.parse, meta={"page_depth": page_depth + 1}) def parse_document(self, response): for res in super().parse_document(response): for field in self.fields: if field not in res: res[field] = [""] for i, month in enumerate(self.months_ru): res["date"][0] = res["date"][0].replace(month, str(i + 1)) yield res
class IzSpider(NewsSpider): name = "iz" start_urls = ["https://iz.ru/sitemap.xml"] config = NewsSpiderConfig( title_path='//h1[contains(@itemprop, "headline")]/span/text()', date_path='//meta[contains(@property, "published_time")]/@content', date_format="%Y-%m-%dT%H:%M:%S%z", text_path="//article//p//text()", topics_path='//div[contains(@itemprop, "genre")]//' 'a[contains(@href, "rubric") or contains(@href, "press-release")]//text()', authors_path='//div[contains(@itemprop, "author")]//a[contains(@href, "author")]//text()', reposts_fb_path="_", reposts_vk_path="_", reposts_ok_path="_", reposts_twi_path="_", reposts_lj_path="_", reposts_tg_path="_", likes_path="_", views_path="_", comm_count_path="_", ) def parse(self, response): """Parse first main sitemap.xml by initial parsing method. Getting sub_sitemaps. """ body = response.body links = Selector(text=body).xpath("//loc/text()").getall() # Parse last sitemap xml number # (in this case: "1"): https://iz.ru/export/sitemap/1/xml sitemap_n = int(links[-1].split("sitemap/")[1].split("/")[0]) # Get last empty sitemap link (main "sitemap.xml" on this site isn't updated frequently enough) # by iterating sitemap links adding "number" to it sitemap_n += 1 while True: link = "https://iz.ru/export/sitemap/{}/xml".format(sitemap_n) body = requests.get(link).content sitemap_links = Selector(text=body).xpath("//loc/text()").getall() # If there are links in this sitemap if sitemap_links: links.append(link) sitemap_n += 1 else: break # Get all links from sitemaps until reach "until_date" for link in links[::-1]: yield Request(url=link, callback=self.parse_sitemap) def parse_sitemap(self, response): # Parse sub sitemaps body = response.body links = Selector(text=body).xpath("//loc/text()").getall() last_modif_dts = Selector(text=body).xpath("//lastmod/text()").getall() # Sort news by modification date descending news = [(link, last_modif_dt) for link, last_modif_dt in zip(links, last_modif_dts)] sorted_news = sorted(news, key=lambda x: x[1], reverse=True) # Iterate news and parse them for link, last_modif_dt in sorted_news: # Convert last_modif_dt to datetime last_modif_dt = datetime.strptime(last_modif_dt, "%Y-%m-%d") if last_modif_dt.date() >= self.until_date: yield Request(url=link, callback=self.parse_document) def parse_document(self, response): for res in super().parse_document(response): # Remove ":" in timezone pub_dt = res["date"][0] res["date"] = [pub_dt[:-3] + pub_dt[-3:].replace(":", "")] # If it is a video article, allow it not to have text if "/video/" in res["url"][0]: if "text" not in res: res["text"] = [""] yield res def _get_last_page_dt(self, link): body = requests.get(link).content pub_dts = Selector(text=body).xpath("//lastmod/text()").getall() return datetime.strptime(pub_dts[0], "%Y-%m-%d")
class VedomostiSpider(NewsSpider):
    name = 'vedomosti'
    start_urls = ['https://www.vedomosti.ru/newsline']
    config = NewsSpiderConfig(
        title_path='(.//div[contains(@class, "b-news-item__title")]//h1)[1]/text()',
        date_path='//time[@class="b-newsline-item__time"]/@pubdate',
        date_format='%Y-%m-%d %H:%M:%S %z',  # 2019-03-02 20:08:47 +0300
        text_path='(.//div[contains(@class, "b-news-item__text")])[1]/p//text()',
        topics_path='(.//div[contains(@class, "io-category")])[1]/text()',
        authors_path='_'
    )
    news_le = LinkExtractor(restrict_xpaths='//div[contains(@class, "b-newsline-item__title")]')

    def parse(self, response):
        if response.meta.get('page_depth', 1) > 1:
            # From the second page onwards the response is a JSON object with an "html" field,
            # so we rebuild a response object from it and extract the links the other way
            d = json.loads(response.body.decode('utf-8'))['html']
            resp = HtmlResponse(url=response.url, body=d, encoding='utf8')

            links = ['https://www.vedomosti.ru/{}'.format(i)
                     for i in resp.xpath('//a/@href').extract()]

            # Getting publication date for every article
            pub_dts = resp.xpath(self.config.date_path).extract()
            # Convert datetimes of publication from string to datetime
            pub_dts = [datetime.strptime(dt, self.config.date_format) for dt in pub_dts]
        else:
            # Getting publication date for every article
            pub_dts = response.xpath(self.config.date_path).extract()
            # Convert datetimes of publication from string to datetime
            pub_dts = [datetime.strptime(dt, self.config.date_format) for dt in pub_dts]

            links = [i.url for i in self.news_le.extract_links(response)]

        for link, pub_dt in zip(links, pub_dts):
            if pub_dt.date() >= self.until_date:
                yield scrapy.Request(url=link, callback=self.parse_document, meta={'date': pub_dt})

        # Get the last date on the page to see whether we need another page
        last_dt = list(pub_dts)[-1]

        # Determine if this is the last page
        if last_dt.date() >= self.until_date:
            # Example: https://www.vedomosti.ru/newsline/more/2019-02-27%2017:18:41%20+0300
            link_url = '{}/more/{}%20{}%20{}'.format(self.start_urls[0],
                                                     last_dt.strftime('%Y-%m-%d'),
                                                     last_dt.strftime('%H:%M:%S'),
                                                     last_dt.strftime('%z'))

            yield scrapy.Request(url=link_url,
                                 priority=100,
                                 callback=self.parse,
                                 meta={'page_depth': response.meta.get('page_depth', 1) + 1})

    def parse_document(self, response):
        for res in super().parse_document(response):
            res['date'] = [response.meta.get('date').strftime(self.config.date_format)]

            all_text = [text.strip() for text in res['text']]
            all_title = [text.strip() for text in res['title']]
            all_topic = [text.strip() for text in res['topics']]

            res['topics'] = [' '.join(all_topic)]
            res['title'] = [' '.join(all_title)]
            res['text'] = [' '.join(all_text)]
            yield res

class RussiaTodaySpider(NewsSpider): name = "rt" start_urls = ["https://russian.rt.com/sitemap.xml"] config = NewsSpiderConfig( title_path="//h1/text()", date_path='//meta[contains(@name, "mediator_published_time")]/@content' ' | //span[@class="main-page-heading__date"]/text()', date_format="%Y-%m-%dT%H:%M:%S", text_path='//div[contains(@class, "article__text")]' '//*[not(contains(@class, "read-more")) and ' 'not(contains(@class, "article__cover"))]//text()' ' | //meta[contains(@name, "description")]/@content' ' | //div[@class="page-content"]/p/text()' ' | //div[@class="page-content"]/blockquote/p/text()' ' | //div[@class="page-content"]/p/a/text()' ' | //div[@class="page-content"]/h2/strong/text()', topics_path='//meta[contains(@name, "mediator_theme")]/@content' ' | //h2[@class="main-page-heading__tag"]/text()', authors_path='//meta[contains(@name, "mediator_author")]/@content' ' | //span[@class="main-page-heading__author"]/text()', reposts_fb_path="_", reposts_vk_path="_", reposts_ok_path="_", reposts_twi_path="_", reposts_lj_path="_", reposts_tg_path="_", likes_path="_", views_path="_", comm_count_path="_", ) def parse(self, response): """Parse first main sitemap.xml by initial parsing method. Getting sub_sitemaps. """ body = response.body links = Selector(text=body).xpath("//loc/text()").getall() for link in links: yield Request(url=link, callback=self.parse_sitemap) def parse_sitemap(self, response): """Parse each sub_sitemap. There is no today's news. """ body = response.body links = Selector(text=body).xpath("//loc/text()").getall() lm_datetimes = Selector(text=body).xpath("//lastmod/text()").getall() for i in range(len(links)): if "https://russian.rt.com/tag/" not in links[i]: if (datetime.strptime(lm_datetimes[i][:22] + "00", "%Y-%m-%dT%H:%M:%S%z").date() >= self.until_date): yield Request(url=links[i], callback=self.parse_document) def fix_date(self, raw_date): """Fix date for regular and authors articles """ months_ru = [ "января", "февраля", "марта", "апреля", "мая", "июня", "июля", "августа", "сентября", "октября", "ноября", "декабря", ] if len(raw_date[0]) == 25: raw_date[0] = raw_date[0][:19] return raw_date else: for i, month in enumerate(months_ru): raw_date[0] = raw_date[0].replace(month, str(i + 1)) return datetime.strptime(raw_date[0], "%d %m %Y,").strftime("%Y-%m-%dT%H:%M:%S") def cut_instagram(self, raw_text): """Cut instagram quote """ clear_text = [] i = 0 while i < len(raw_text): if " Посмотреть эту публикацию в Instagram" == raw_text[i]: while "PDT" not in raw_text[i]: i += 1 i += 1 else: clear_text.append(raw_text[i]) i += 1 return clear_text def parse_document(self, response): """Final parsing method. Parse each article.""" for item in super().parse_document(response): item["date"] = self.fix_date(item["date"]) item["text"] = self.cut_instagram(item["text"]) yield item
class KommersantSpider(NewsSpider):
    name = 'kommersant'
    base_url = 'https://www.kommersant.ru'
    link_tmpl = 'https://www.kommersant.ru/archive/list/77/{}'
    # Start with the current date
    page_dt = datetime.now()
    start_urls = [link_tmpl.format(page_dt.strftime('%Y-%m-%d'))]

    # Ignore "robots.txt" for this spider only
    custom_settings = {'ROBOTSTXT_OBEY': 'False'}

    config = NewsSpiderConfig(
        title_path='(.//*[@class="article_name"])[1]//text()',
        date_path='//meta[contains(@property, "published_time")]/@content',
        date_format='%Y-%m-%dT%H:%M:%S%z',  # 2019-03-09T12:03:10+03:00
        text_path='//p[@class="b-article__text"]//text()',
        topics_path='//meta[contains(@name, "category")]/@content',
        authors_path='//p[contains(@class, "document_authors")]//text()')
    news_le = LinkExtractor(
        restrict_xpaths='//div[@class="archive_result__item_text"]')

    def parse(self, response):
        # Parse the most recent news
        for i in self.news_le.extract_links(response):
            yield scrapy.Request(url=i.url,
                                 callback=self.parse_document,
                                 meta={'page_dt': self.page_dt})

        # If it's not the end of the page, request more news from the archive
        # by calling the recursive "parse_page" method
        more_link = response.xpath(
            '//button[contains(@class, "lazyload-button")]/@data-lazyload-url'
        ).extract()
        if more_link:
            yield scrapy.Request(url='{}{}'.format(self.base_url, more_link[0]),
                                 callback=self.parse_page,
                                 meta={'page_dt': self.page_dt})

        # Requesting the next page if we need to
        self.page_dt -= timedelta(days=1)
        if self.page_dt.date() >= self.until_date:
            link_url = self.link_tmpl.format(self.page_dt.strftime('%Y-%m-%d'))

            yield scrapy.Request(url=link_url,
                                 priority=100,
                                 callback=self.parse,
                                 meta={
                                     'page_depth': response.meta.get('page_depth', 1) + 1,
                                     'page_dt': self.page_dt
                                 })

    def parse_page(self, response):
        # Parse all articles on the page
        for i in self.news_le.extract_links(response):
            yield scrapy.Request(url=i.url, callback=self.parse_document)

        # Take a link from the "more" button
        more_link = response.xpath(
            '//button[contains(@class, "lazyload-button")]/@data-lazyload-url'
        ).extract()
        if more_link:
            yield scrapy.Request(url='{}{}'.format(self.base_url, more_link[0]),
                                 callback=self.parse_page,
                                 meta={
                                     'page_depth': response.meta.get('page_depth', 1),
                                     'page_dt': response.meta['page_dt']
                                 })

    def parse_document(self, response):
        for res in super().parse_document(response):
            # If it's a gallery (no text) or a special project, don't return anything
            # (those pages have a different html layout)
            if 'text' not in res or 'title' not in res:
                break

            # Remove ":" in timezone
            pub_dt = res['date'][0]
            res['date'] = [pub_dt[:-3] + pub_dt[-3:].replace(':', '')]
            yield res

class GazetaSpider(NewsSpider): name = "gazeta" start_urls = ["https://www.gazeta.ru/sitemap.xml"] config = NewsSpiderConfig( title_path='//div[contains(@itemprop, "alternativeHeadline")]//text() | ' "//h1/text()", date_path='//time[contains(@itemprop, "datePublished")]/@datetime', date_format="%Y-%m-%dT%H:%M:%S%z", text_path='//div[contains(@itemprop, "articleBody")]//p//text() | ' '//span[contains(@itemprop, "description")]//text()', topics_path='//div[contains(@class, "active")]/a/span/text()', authors_path='//span[contains(@itemprop, "author")]//text()', reposts_fb_path="_", reposts_vk_path="_", reposts_ok_path="_", reposts_twi_path="_", reposts_lj_path="_", reposts_tg_path="_", likes_path="_", views_path="_", comm_count_path="_", ) def parse(self, response): # Parse main sitemap body = response.body links = Selector(text=body).xpath("//loc/text()").getall() last_modif_dts = Selector(text=body).xpath("//lastmod/text()").getall() for link, last_modif_dt in zip(links, last_modif_dts): # Convert last_modif_dt to datetime last_modif_dt = datetime.strptime(last_modif_dt.replace(":", ""), "%Y-%m-%dT%H%M%S%z") if last_modif_dt.date() >= self.until_date: yield Request(url=link, callback=self.parse_sub_sitemap) def parse_sub_sitemap(self, response): # Parse sub sitemaps body = response.body links = Selector(text=body).xpath("//loc/text()").getall() last_modif_dts = Selector(text=body).xpath("//lastmod/text()").getall() for link, last_modif_dt in zip(links, last_modif_dts): # Convert last_modif_dt to datetime last_modif_dt = datetime.strptime(last_modif_dt.replace(":", ""), "%Y-%m-%dT%H%M%S%z") if last_modif_dt.date() >= self.until_date: yield Request(url=link, callback=self.parse_articles_sitemap) def parse_articles_sitemap(self, response): # Parse sub sitemaps body = response.body links = Selector(text=body).xpath("//loc/text()").getall() last_modif_dts = Selector(text=body).xpath("//lastmod/text()").getall() for link, last_modif_dt in zip(links, last_modif_dts): # Convert last_modif_dt to datetime last_modif_dt = datetime.strptime(last_modif_dt.replace(":", ""), "%Y-%m-%dT%H%M%S%z") if last_modif_dt.date() >= self.until_date: if link.endswith(".shtml") and not link.endswith("index.shtml"): yield Request(url=link, callback=self.parse_document) def parse_document(self, response): for res in super().parse_document(response): # Remove advertisement blocks ad_parts = ("\nРеклама\n", "\n.AdCentre_new_adv", " AdfProxy.ssp", "\nset_resizeblock_handler") res["text"] = [x.replace("\n", "\\n") for x in res["text"] if x != "\n" and not x.startswith(ad_parts)] # Remove ":" in timezone pub_dt = res["date"][0] res["date"] = [pub_dt[:-3] + pub_dt[-3:].replace(":", "")] yield res
class RussiaTodaySpider(NewsSpider):
    name = 'rt'
    start_urls = ['https://russian.rt.com/sitemap.xml']
    config = NewsSpiderConfig(
        title_path='//h1/text()',
        date_path='//meta[contains(@name, "mediator_published_time")]/@content',
        date_format="%Y-%m-%dT%H:%M:%S",
        text_path='//div[contains(@class, "article__text")]//text()',
        topics_path='//meta[contains(@name, "mediator_theme")]/@content',
        authors_path='_',
        reposts_fb_path='_', reposts_vk_path='_', reposts_ok_path='_',
        reposts_twi_path='_', reposts_lj_path='_', reposts_tg_path='_',
        likes_path='_', views_path='_', comm_count_path='_')

    def parse(self, response):
        """Parse the main sitemap.xml and request each sub-sitemap."""
        body = response.body
        links = Selector(text=body).xpath('//loc/text()').getall()
        for link in links:
            yield Request(url=link, callback=self.parse_sitemap)

    def parse_sitemap(self, response):
        """Parse each sub-sitemap."""
        body = response.body
        links = Selector(text=body).xpath('//loc/text()').getall()
        for link in links:
            yield Request(url=link, callback=self.parse_document)

    def _fix_syntax(self, sample: List[str], idx_split: int) -> List[str]:
        """Fix timestamp syntax by dropping the timezone postfix."""
        sample = [sample[0][:idx_split]]
        return sample

    def _get_date(self, lst: List[str]):
        """Convert a list of [year, month, day] strings into a date object."""
        y, m, d = [int(num) for num in lst]
        return date(y, m, d)

    def parse_document(self, response):
        """Final parsing method. Parse each article."""
        for item in super().parse_document(response):
            # Try to drop the timezone postfix.
            try:
                item['date'] = self._fix_syntax(item['date'], -6)
            except KeyError:
                print('Error. No date value.')
            else:
                raw_date = item['date'][0][:10].split('-')
                processed_date = self._get_date(raw_date)
                if processed_date >= self.until_date:
                    yield item

class RbcSpider(NewsSpider): name = "rbc" link_tmpl = "https://www.rbc.ru/v10/ajax/get-news-feed/project/rbcnews/lastDate/{}/limit/22" start_urls = [link_tmpl.format(int(time.time()))] config = NewsSpiderConfig( title_path='(.//span[contains(@class, "js-slide-title")])[1]//text()', date_path="_", date_format="%Y-%m-%d %H:%M:%S", text_path='(.//div[contains(@class, "article__text")])' '/*[not(self::script) and not(self::div[@class="subscribe-infographic"])]//text()', topics_path='(.//a[contains(@class, "article__header__category")])[1]//text()', authors_path='//div[contains(@class, "article__authors")]/text()', reposts_fb_path="_", reposts_vk_path="_", reposts_ok_path="_", reposts_twi_path="_", reposts_lj_path="_", reposts_tg_path="_", likes_path="_", views_path="_", comm_count_path="_", ) def parse(self, response): items = json.loads(response.body.decode("utf-8"))["items"] pub_dt = None for i in items: resp = HtmlResponse(url="", body=i["html"], encoding="utf8") link = resp.xpath("//a/@href").extract()[0] pub_dt = datetime.fromtimestamp(i["publish_date_t"]) if pub_dt.date() >= self.until_date: yield scrapy.Request(url=link, callback=self.parse_document, meta={"pub_dt": pub_dt}) # Requesting page if publication date of the last article is above "until_date" if pub_dt and pub_dt.date() >= self.until_date: # Forming the next page link link_url = self.link_tmpl.format(int(pub_dt.timestamp())) yield scrapy.Request( url=link_url, priority=100, callback=self.parse, meta={"page_depth": response.meta.get("page_depth", 1) + 1}, ) def parse_document(self, response): for res in super().parse_document(response): res["date"] = [response.meta["pub_dt"].strftime(self.config.date_format)] # If the article is located in "www.rbc.ru" url, then return it # (not "sportrbc.ru", "delovtb.rbc.ru" e t.c. because they have another html layout) if res["edition"][0] == "-": if "authors" in res: res["authors"] = [ i.replace("\n", "").strip() for i in res["authors"] if i.replace("\n", "").strip() ] res["text"] = [i.replace("\xa0", " ") for i in res["text"]] yield res
class RiaSpider(NewsSpider): name = "ria" start_urls = ["https://www.ria.ru"] config = NewsSpiderConfig( title_path='//h1[contains(@class, "article__title")]/text()', date_path='//div[contains(@class, "endless__item")]/@data-published', date_format="%Y-%m-%dT%H:%M", text_path= '//div[contains(@class, "article__block") and @data-type = "text"]//text()', topics_path='//a[contains(@class, "article__tags-item")]/text()', authors_path="_", reposts_fb_path="_", reposts_vk_path="_", reposts_ok_path="_", reposts_twi_path="_", reposts_lj_path="_", reposts_tg_path="_", likes_path='//span[contains(@class,"m-value")]/text()', views_path='//span[contains(@class,"statistic__item m-views")]/text()', comm_count_path="_", ) news_le = LinkExtractor(restrict_css="div.lenta__item") def parse(self, response): article_links = self.news_le.extract_links(response) last_link = "" for link in article_links: last_link = link.url yield scrapy.Request(url=link.url, callback=self.parse_document) dt = self._get_last_dt_on_page(last_link) if datetime.strptime( dt, self.config.date_format).date() >= self.until_date: # Getting and forming the next page link next_page_link = response.xpath( '//div[contains(@class, "lenta__item")]/@data-next').extract( )[0] link_url = "{}{}".format(self.start_urls[0], next_page_link) yield scrapy.Request( url=link_url, priority=100, callback=self.parse, meta={"page_depth": response.meta.get("page_depth", 1) + 1}, ) def parse_document(self, response): for res in super().parse_document(response): # Leave only the last tag # (the last tag is always a global website tag) res["topics"] = [res["topics"][-1]] yield res def _get_last_dt_on_page(self, link): r = requests.get(link) source_code = r.text root = lxml.html.fromstring(source_code) dt = root.xpath(self.config.date_path)[0] return dt
class MeduzaSpider(NewsSpider):
    name = 'meduza'

    # Page link template
    page_link_tmpl = 'https://meduza.io/api/v3/search?chrono=news&page={}&per_page=24&locale=ru'
    # Article link template
    article_link_tmpl = 'https://meduza.io/api/w4/{}'
    # Start with the first page
    start_urls = [page_link_tmpl.format(0)]

    config = NewsSpiderConfig(
        title_path='_',
        date_path='_',
        date_format='%Y-%m-%d %H:%M:%S',
        text_path='_',
        topics_path='_',
        authors_path='_'
    )

    def parse(self, response):
        last_page = False
        jsonresponse = json.loads(response.body_as_unicode())

        # Getting article items
        articles = [content for _, content in jsonresponse['documents'].items()]
        # Sorting them from the most recent to the oldest one
        articles = sorted(articles, key=lambda x: x['published_at'], reverse=True)

        # Filtering out articles published before "until_date" and checking if we have reached it
        filtered_articles = []
        for content in articles:
            pub_date = datetime.strptime(content['pub_date'], '%Y-%m-%d').date()
            if pub_date >= self.until_date:
                filtered_articles.append(content)
            else:
                last_page = True

        # Iterating through news on this page
        for content in filtered_articles:
            full_url = self.article_link_tmpl.format(content['url'])
            yield scrapy.Request(url=full_url, callback=self.parse_document)

        # Requesting a new page if needed
        if not last_page and jsonresponse['has_next']:
            page_depth = response.meta.get('page_depth', 1)
            link_url = self.page_link_tmpl.format(page_depth)

            yield scrapy.Request(url=link_url,
                                 priority=100,
                                 callback=self.parse,
                                 meta={'page_depth': page_depth + 1})

    def parse_document(self, response):
        news_item = json.loads(response.body_as_unicode())['root']
        url = 'https://meduza.io/{}'.format(news_item['url'])

        # Taking all blocks with information from the response
        blocks = self._get_text_blocks(news_item)

        # Extract text paragraphs from every block of the article
        text_paragraphs = self._extract_text_from_blocks(blocks)

        base_edition = urlsplit(self.start_urls[0])[1]
        edition = urlsplit(url)[1]

        # Replace every \xa0 with a space
        text_paragraphs = [text.replace('\xa0', ' ') for text in text_paragraphs]
        title = news_item['title'].replace('\xa0', ' ')

        # Constructing the resulting item
        l = ItemLoader(item=Document(), response=response)
        l.add_value('url', url)
        l.add_value('edition', '-' if edition == base_edition else edition)
        l.add_value('title', title)
        l.add_value('topics', '')
        l.add_value('date', datetime.utcfromtimestamp(news_item['datetime']).strftime(self.config.date_format))
        l.add_value('text', text_paragraphs if text_paragraphs else [''])
        l.add_value('authors', news_item['source']['name'] if 'source' in news_item else [''])
        yield l.load_item()

    def _extract_text_from_blocks(self, blocks):
        text_paragraphs = []

        # Block types which contain text
        block_types = ['p', 'context_p', 'blockquote', 'image', 'h3',
                       'card_title', 'ul', 'lead']

        for block in blocks:
            if block['type'] in block_types:
                if block['type'] == 'image':
                    text_paragraphs.append(block['data'].get('caption', ''))
                elif block['type'] == 'card_title':
                    text_paragraphs.append(block['data'].get('text', ''))
                elif block['type'] == 'ul':
                    for one_elem in block['data']:
                        text_paragraphs.append(one_elem)
                else:
                    # Paragraphs can be empty (without text)
                    text_paragraphs.append(block.get('data', ''))

        return text_paragraphs

    def _get_text_blocks(self, news_item):
        blocks = []

        # Get all blocks with data depending on the article type (news, slides, cards)
        if 'blocks' in news_item['content']:
            blocks = news_item['content']['blocks']
        elif 'slides' in news_item['content']:
            # Joining all slides into a list of blocks
            for one_slide in news_item['content']['slides']:
                for block in one_slide['blocks']:
                    blocks.append(block)
        elif 'cards' in news_item['content']:
            # Joining all cards into a list of blocks
            for one_slide in news_item['content']['cards']:
                for block in one_slide['blocks']:
                    blocks.append(block)

        return blocks

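
# ---------------------------------------------------------------------------
# NOTE (illustrative sketch, not part of the original spiders): every class
# above assumes a shared NewsSpider base class, a NewsSpiderConfig container
# and a Document item defined elsewhere in the project, plus an "until_date"
# crawl boundary. The names and behaviour below are only inferred from how
# they are used above; the project's real implementations may differ.
from collections import namedtuple
from datetime import datetime, timedelta

import scrapy
from scrapy.loader import ItemLoader

# Assumed: a config is a plain record of XPath/CSS strings; "_" marks a field
# the spider does not extract.
NewsSpiderConfig = namedtuple(
    "NewsSpiderConfig",
    ["title_path", "date_path", "date_format", "text_path", "topics_path",
     "authors_path", "reposts_fb_path", "reposts_vk_path", "reposts_ok_path",
     "reposts_twi_path", "reposts_lj_path", "reposts_tg_path",
     "likes_path", "views_path", "comm_count_path"],
)
# Let spiders omit the trailing fields, as several configs above do.
NewsSpiderConfig.__new__.__defaults__ = ("_",) * 10


class Document(scrapy.Item):
    """Assumed item schema: every field holds a list of strings."""
    url = scrapy.Field()
    edition = scrapy.Field()
    title = scrapy.Field()
    date = scrapy.Field()
    text = scrapy.Field()
    topics = scrapy.Field()
    authors = scrapy.Field()


class NewsSpider(scrapy.Spider):
    """Hypothetical base spider bounded by an "until_date" argument,
    e.g. ``scrapy crawl ria -a until_date=01.03.2019``."""

    config = None  # each subclass sets its own NewsSpiderConfig

    def __init__(self, *args, until_date=None, **kwargs):
        super().__init__(*args, **kwargs)
        if until_date is None:
            # Assumed default crawl depth: one week back from today.
            self.until_date = (datetime.now() - timedelta(days=7)).date()
        else:
            self.until_date = datetime.strptime(until_date, "%d.%m.%Y").date()

    def parse_document(self, response):
        # Generic extraction: apply every configured path and keep whatever
        # was found; the subclasses above post-process the loaded item.
        loader = ItemLoader(item=Document(), response=response)
        loader.add_value("url", response.url)
        loader.add_value("edition", "-")  # simplification: subclasses may compute this
        for field in ("title", "date", "text", "topics", "authors"):
            path = getattr(self.config, field + "_path")
            if path != "_":
                loader.add_xpath(field, path)
        yield loader.load_item()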