Example #1
class InterfaxSpider(NewsSpider):
    name = "interfax"
    start_urls = ["https://www.interfax.ru/news/2008/02/11"]
    config = NewsSpiderConfig(
        title_path='//h1/text()',
        date_path=
        '//div[contains(@class, "tMC_head")]/meta[contains(@itemprop, "datePublished")]/@content',
        date_format="%Y-%m-%dT%H:%M:%S",
        text_path='//article//text()',
        topics_path='//div[contains(@class, "textML")]/a/text()')

    def parse(self, response):
        today = datetime.datetime.today()
        first_day = datetime.datetime(year=2008, month=2, day=11)
        date_range = [
            first_day + datetime.timedelta(days=x)
            for x in range((today - first_day).days)
        ]
        for date in date_range:
            url = "https://www.interfax.ru/news/" + date.strftime("%Y/%m/%d")
            yield response.follow(url, self.parse_page)

    def parse_page(self, response):
        url = response.url
        page = int(url.split("page_")[-1]) if "page_" in url else 0
        for page_href in response.xpath(
                '//div[contains(@class, "pages")]/a/@href').extract():
            if page != 0:
                continue
            yield response.follow(page_href, self.parse_page)
        for document_href in response.xpath(
                '//div[contains(@class, "an")]/div/a/@href').extract():
            yield response.follow(document_href, self.parse_document)
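A minimal runner sketch for a spider like the one above, assuming the module path "spiders.interfax" and that the project's NewsSpider base class turns an "until_date" keyword argument into the self.until_date used by these spiders (both are assumptions about the surrounding project); the CrawlerProcess API itself is standard Scrapy.

# Minimal runner sketch. The module path and the "until_date" keyword are
# assumptions about the surrounding project, not shown in these examples.
import datetime

from scrapy.crawler import CrawlerProcess

from spiders.interfax import InterfaxSpider  # hypothetical module path

if __name__ == "__main__":
    process = CrawlerProcess(settings={
        # Write scraped items to a JSON-lines file (standard Scrapy FEEDS setting)
        "FEEDS": {"interfax.jsonl": {"format": "jsonlines"}},
    })
    # Keyword arguments given to crawl() are forwarded to the spider's constructor;
    # the NewsSpider base class presumably stores until_date as self.until_date.
    process.crawl(InterfaxSpider,
                  until_date=datetime.date.today() - datetime.timedelta(days=7))
    process.start()  # blocks until the crawl finishes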
Example #2
class TvZvezdaSpider(NewsSpider):
    name = "tvzvezda"
    start_urls = ["https://tvzvezda.ru/news"]
    config = NewsSpiderConfig(
        title_path='//h1/text()',
        date_path='//div[contains(@class, "date_news")]//text()',
        date_format="%H:%M %d.%m.%Y",
        text_path='//div[contains(@class, "glav_text")]//text()',
        topics_path='//meta[contains(@property, "article:section")]/@content',
        authors_path='//div[contains(@class, "autor_news")]/a/text()',
        reposts_fb_path='_',
        reposts_vk_path='_',
        reposts_ok_path='_',
        reposts_twi_path='_',
        reposts_lj_path='_',
        reposts_tg_path='_',
        likes_path='_',
        views_path='_',
        comm_count_path='_'
    )
    news_le = LinkExtractor(restrict_css='div.js-ajax-receiver a.news_one')

    # Pagination offset appended to the "load more" URL; advanced by 20 per request
    z = 0
    visited_urls = []

    def parse(self, response):
        if response.url not in self.visited_urls:
            self.visited_urls.append(response.url)
            for link in self.news_le.extract_links(response):
                yield scrapy.Request(url=link.url, callback=self.parse_document)
        next_pages = response.xpath('//a[contains(@class, "all_news js-ajax-call")]/@href').extract()
        next_pages = next_pages[-1]
        new_url = '20/' + str(self.z) + '/?_=1542171175300'
        self.z += 20
        yield response.follow(next_pages + new_url, callback=self.parse)
Example #3
class RiaSpider(NewsSpider):
    name = 'ria'
    start_urls = ['https://www.ria.ru']
    config = NewsSpiderConfig(
        title_path='//h1[contains(@class, "article__title")]/text()',
        date_path='//div[contains(@class, "endless__item")]/@data-published',
        date_format='%Y-%m-%dT%H:%MZ',
        text_path='//div[contains(@class, "article__block") and @data-type = "text"]//text()',
        topics_path='//a[contains(@class, "article__tags-item")]/text()',
        authors_path='_'
    )
    news_le = LinkExtractor(restrict_css='div.lenta__item')

    def parse(self, response):
        article_links = self.news_le.extract_links(response)

        last_link = ''
        for link in article_links:
            last_link = link.url

            yield scrapy.Request(url=link.url, callback=self.parse_document)

        dt = self._get_last_dt_on_page(last_link)

        if datetime.strptime(dt, self.config.date_format).date() >= self.until_date:
            # Getting and forming the next page link
            next_page_link = response.xpath('//div[contains(@class, "lenta__item")]/@data-next').extract()[0]
            link_url = '{}{}'.format(self.start_urls[0], next_page_link)

            yield scrapy.Request(url=link_url,
                                 priority=100,
                                 callback=self.parse,
                                 meta={'page_depth': response.meta.get('page_depth', 1) + 1}
                                 )

    def parse_document(self, response):
        for res in super().parse_document(response):
            # Leave only the last tag
            # (the last tag is always a global website tag)
            res['topics'] = [res['topics'][-1]]

            yield res

    def _get_last_dt_on_page(self, link):
        r = requests.get(link)
        source_code = r.text

        root = lxml.html.fromstring(source_code)

        dt = root.xpath(self.config.date_path)[0]

        return dt
Example #4
class RbcSpider(NewsSpider):
    name = 'rbc'
    link_tmpl = 'https://www.rbc.ru/v10/ajax/get-news-feed/project/rbcnews/lastDate/{}/limit/22'
    start_urls = [link_tmpl.format(int(time.time()))]
    config = NewsSpiderConfig(
        title_path='(.//span[contains(@class, "js-slide-title")])[1]//text()',
        date_path='_',
        date_format='%Y-%m-%d %H:%M:%S',
        text_path='(.//div[contains(@class, "article__text")])'
                  '/*[not(self::script) and not(self::div[@class="subscribe-infographic"])]//text()',
        topics_path='(.//a[contains(@class, "article__header__category")])[1]//text()',
        authors_path='//div[contains(@class, "article__authors")]/text()'
    )

    def parse(self, response):
        items = json.loads(response.body.decode('utf-8'))['items']

        pub_dt = None
        for i in items:
            resp = HtmlResponse(url='', body=i['html'], encoding='utf8')

            link = resp.xpath('//a/@href').extract()[0]
            pub_dt = datetime.fromtimestamp(i['publish_date_t'])

            if pub_dt.date() >= self.until_date:
                yield scrapy.Request(url=link,
                                     callback=self.parse_document,
                                     meta={"pub_dt": pub_dt})

        # Request the next page if the publication date of the last article is not older than "until_date"
        if pub_dt and pub_dt.date() >= self.until_date:
            # Forming the next page link
            link_url = self.link_tmpl.format(int(pub_dt.timestamp()))

            yield scrapy.Request(url=link_url,
                                 priority=100,
                                 callback=self.parse,
                                 meta={'page_depth': response.meta.get('page_depth', 1) + 1}
                                 )

    def parse_document(self, response):
        for res in super().parse_document(response):
            res['date'] = [response.meta["pub_dt"].strftime(self.config.date_format)]

            # Return the article only if it is hosted on "www.rbc.ru"
            # (not "sportrbc.ru", "delovtb.rbc.ru" etc., because those have a different html layout)
            if res['edition'][0] == '-':
                if 'authors' in res:
                    res['authors'] = [i.replace('\n', '').strip() for i in res['authors'] if i.replace('\n', '').strip()]
                res['text'] = [i.replace('\xa0', ' ') for i in res['text']]

                yield res
Example #5
class InterfaxSpider(NewsSpider):
    name = "interfax"

    start_urls = [
        "https://www.interfax.ru/news/{}".format(
            datetime.datetime.today().strftime("%Y/%m/%d"))
    ]
    config = NewsSpiderConfig(
        title_path='//h1[contains(@itemprop, "headline")]/text()',
        date_path='//meta[contains(@property, "published_time")]/@content',
        date_format="%Y-%m-%dT%H:%M%z",
        text_path=
        '//article[contains(@itemprop, "articleBody")]/p[not(contains(@itemprop, "author"))]//text()',
        topics_path='//aside[contains(@class, "textML")]/a//text()',
        authors_path='//p[contains(@itemprop, "author")]//text()',
        reposts_fb_path="_",
        reposts_vk_path="_",
        reposts_ok_path="_",
        reposts_twi_path="_",
        reposts_lj_path="_",
        reposts_tg_path="_",
        likes_path="_",
        views_path="_",
        comm_count_path="_",
    )

    def parse(self, response):
        page_date = datetime.datetime.today().date()

        while page_date >= self.until_date:
            url = "https://www.interfax.ru/news/" + page_date.strftime(
                "%Y/%m/%d")
            yield response.follow(url, self.parse_page)

            page_date -= datetime.timedelta(days=1)

    def parse_page(self, response):
        url = response.url
        page = int(url.split("page_")[-1]) if "page_" in url else 0
        for page_href in response.xpath(
                '//div[contains(@class, "pages")]/a/@href').extract():
            if page != 0:
                continue
            yield response.follow(page_href, self.parse_page)
        for document_href in response.xpath(
                '//div[contains(@class, "an")]/div/a/@href').extract():
            yield response.follow(document_href, self.parse_document)
Example #6
class GazetaSpider(NewsSpider):
    name = 'gazeta'
    start_urls = ['https://www.gazeta.ru/news/']
    config = NewsSpiderConfig(
        title_path='//h1/text()',
        date_path='//time[contains(@class, "date_time red")]/text()',
        date_format='%d.%m.%Y | %H:%M',
        text_path='//div[contains(@class, "article-text-body")]//text()',
        topics_path='//div[contains(@class, "active")]/a/span/text()')
    news_le = LinkExtractor(restrict_css='div.article_text h1.txt_2b')
    max_page_depth = 4

    def parse(self, response):
        if response.meta.get('page_depth', 1) < self.max_page_depth:
            # Get last article datetime on the current page
            last_page_dt = response.xpath(
                '//time[contains(@class, "txtclear")]/@datetime').extract()[-1]
            # Convert it to datetime without timezone part
            last_page_dt = datetime.strptime(last_page_dt[:-6],
                                             '%Y-%m-%dT%H:%M:%S')

            # Forming the next page link
            link_url = '{}?p=page&d={}'.format(
                self.start_urls[0], last_page_dt.strftime('%d.%m.%Y_%H:%M'))

            yield scrapy.Request(
                url=link_url,
                priority=100,
                callback=self.parse,
                meta={'page_depth': response.meta.get('page_depth', 1) + 1})

        for link in self.news_le.extract_links(response):
            yield scrapy.Request(url=link.url, callback=self.parse_document)

    def parse_document(self, response):
        for res in super().parse_document(response):
            # Remove advertisement blocks
            ad_parts = ('\nРеклама\n', '\n.AdCentre_new_adv', ' AdfProxy.ssp',
                        '\nset_resizeblock_handler')

            res['text'] = [
                x for x in res['text']
                if x != '\n' and not x.startswith(ad_parts)
            ]

            yield res
Example #7
class RiaSpider(NewsSpider):
    name = 'ria'
    start_urls = ['https://www.ria.ru']
    config = NewsSpiderConfig(
        title_path='//h1[contains(@class, "article__title")]/text()',
        date_path='//div[contains(@class, "endless__item")]/@data-published',
        date_format='%Y-%m-%dT%H:%MZ',
        text_path=
        '//div[contains(@class, "article__block") and @data-type = "text"]//text()',
        topics_path='//a[contains(@class, "article__tags-item")]/text()')
    news_le = LinkExtractor(restrict_css='div.lenta__item')
    max_page_depth = 4

    def parse(self, response):
        article_links = self.news_le.extract_links(response)

        if response.meta.get('page_depth', 1) < self.max_page_depth:
            # Getting and forming the next page link
            next_page_link = response.xpath(
                '//div[contains(@class, "lenta__item")]/@data-next').extract(
                )[0]
            link_url = '{}{}'.format(self.start_urls[0], next_page_link)

            yield scrapy.Request(
                url=link_url,
                priority=100,
                callback=self.parse,
                meta={'page_depth': response.meta.get('page_depth', 1) + 1})

        for link in article_links:
            yield scrapy.Request(url=link.url, callback=self.parse_document)

    def parse_document(self, response):
        for res in super().parse_document(response):
            # Leave only the last tag
            # (the last tag is always a global website tag)
            res['topics'] = [res['topics'][-1]]

            yield res
Example #8
class RussiaTodaySpider(NewsSpider):
    name = "rt"
    start_urls = ["https://russian.rt.com/news"]
    config = NewsSpiderConfig(
        title_path='//h1/text()',
        date_path='//meta[contains(@name, "mediator_published_time")]/@content',
        date_format="%Y-%m-%dT%H:%M:%S",
        text_path='//div[contains(@class, "article__text")]//text()',
        topics_path='//meta[contains(@name, "mediator_theme")]/@content',
        authors_path='_')
    news_le = LinkExtractor(restrict_css='div.listing__card div.card__heading')
    page_le = LinkExtractor(
        restrict_css='div.listing__button.listing__button_js',
        tags=['div'],
        attrs=['data-href'])
    max_page_depth = 4

    def parse(self, response):
        if response.meta.get("page_depth", 1) < self.max_page_depth:
            for link in self.page_le.extract_links(response):
                yield scrapy.Request(url=link.url,
                                     priority=100,
                                     callback=self.parse,
                                     meta={
                                         "page_depth":
                                         response.meta.get("page_depth", 1) + 1
                                     })

        for link in self.news_le.extract_links(response):
            yield scrapy.Request(url=link.url, callback=self.parse_document)

    def parse_document(self, response):
        for res in super().parse_document(response):
            if isinstance(res, Document):
                if isinstance(res["date"], list):
                    res["date"] = [x[:-6] for x in res["date"] if x]
                else:
                    res["date"] = res["date"][:-6]
            yield res
Example #9
class IzSpider(NewsSpider):
    name = "iz"
    start_urls = ["https://iz.ru/feed"]
    config = NewsSpiderConfig(
        title_path='//h1/span/text()',
        date_path=
        '//div[contains(@class, "article_page__left__top__time__label")]/div/time/@datetime',
        date_format="%Y-%m-%dT%H:%M:%SZ",
        text_path='//div[contains(@itemprop, "articleBody")]/div/p//text()',
        topics_path='//div[contains(@class, "rubrics_btn")]/div/a/text()')

    visited_urls = []

    def parse(self, response):
        if response.url not in self.visited_urls:
            self.visited_urls.append(response.url)
            for link in response.xpath(
                    '//div[@class="lenta_news__day"]/div/a/@href').extract():
                url = urljoin(response.url, link)
                yield scrapy.Request(url=url, callback=self.parse_document)
        next_pages = response.xpath(
            '//a[contains(@class, "button")]/@href').extract()
        next_pages = next_pages[-1]
        yield response.follow(next_pages, callback=self.parse)
Example #10
class InterfaxSpider(NewsSpider):
    name = "interfax"

    start_urls = [
        "https://www.interfax.ru/news/{}".format(
            datetime.datetime.today().strftime("%Y/%m/%d"))
    ]
    config = NewsSpiderConfig(
        title_path='//h1/text()',
        date_path=
        '//div[contains(@class, "tMC_head")]/meta[contains(@itemprop, "datePublished")]/@content',
        date_format="%Y-%m-%dT%H:%M:%S",
        text_path='//article//text()',
        topics_path='//div[contains(@class, "textML")]/a/text()',
        authors_path='_')

    def parse(self, response):
        page_date = datetime.datetime.today().date()

        while page_date >= self.until_date:
            url = "https://www.interfax.ru/news/" + page_date.strftime(
                "%Y/%m/%d")
            yield response.follow(url, self.parse_page)

            page_date -= datetime.timedelta(days=1)

    def parse_page(self, response):
        url = response.url
        page = int(url.split("page_")[-1]) if "page_" in url else 0
        for page_href in response.xpath(
                '//div[contains(@class, "pages")]/a/@href').extract():
            if page != 0:
                continue
            yield response.follow(page_href, self.parse_page)
        for document_href in response.xpath(
                '//div[contains(@class, "an")]/div/a/@href').extract():
            yield response.follow(document_href, self.parse_document)
Example #11
class MeduzaSpider(NewsSpider):
    name = 'meduza'
    page_link_tmpl = 'https://meduza.io/api/v3/search?chrono=news&page={}&per_page=24&locale=ru'
    article_link_tmpl = 'https://meduza.io/{}'
    start_urls = [page_link_tmpl.format(0)]
    months_ru = [
        'января',
        'февраля',
        'марта',
        'апреля',
        'мая',
        'июня',
        'июля',
        'августа',
        'сентября',
        'октября',
        'ноября',
        'декабря',
    ]
    fields = [
        'title',
        'topics',
        'authors',
        'edition',
        'url',
        'text',
        'date',
    ]

    config = NewsSpiderConfig(
        title_path=
        '//h1[@class="RichTitle-root" or @class="SimpleTitle-root" or ' +
        '@class="RichTitle-root RichTitle-slide"]//text()',
        date_path='//time[@class="Timestamp-root"]/text()',
        date_format='%H:%M, %d %m %Y',
        text_path=
        '//div[@class="GeneralMaterial-article" or @class="SlidesMaterial-layout" '
        +
        'or @class="MediaCaption-caption"]//p//text() | //div[@class="MediaCaption-caption"]//text() | '
        + '//p[@class="SimpleBlock-p" or @class="SimpleBlock-lead"]//text()',
        topics_path='_',
        authors_path='_',
        reposts_fb_path='_',
        reposts_vk_path='_',
        reposts_ok_path='_',
        reposts_twi_path='_',
        reposts_lj_path='_',
        reposts_tg_path='_',
        likes_path='_',
        views_path='_',
        comm_count_path='_',
    )

    def parse(self, response):
        last_page = False

        jsonresponse = json.loads(response.body_as_unicode())

        # Getting article items
        articles = [
            content for _, content in jsonresponse['documents'].items()
        ]
        # Sorting them from the most recent to the oldest
        articles = sorted(articles,
                          key=lambda x: x['published_at'],
                          reverse=True)

        # Filtering out articles older than "until_date" and noting when we have reached it
        filtered_articles = []
        for content in articles:
            pub_date = datetime.strptime(content['pub_date'],
                                         '%Y-%m-%d').date()
            if pub_date >= self.until_date:
                filtered_articles.append(content)
            else:
                last_page = True

        # Iterating through news on this page
        for content in filtered_articles:
            full_url = self.article_link_tmpl.format(content['url'])

            yield scrapy.Request(url=full_url, callback=self.parse_document)

        # Requesting a new page if needed
        if not last_page and jsonresponse['has_next']:
            page_depth = response.meta.get('page_depth', 1)

            link_url = self.page_link_tmpl.format(page_depth)

            yield scrapy.Request(url=link_url,
                                 priority=100,
                                 callback=self.parse,
                                 meta={'page_depth': page_depth + 1})

    def parse_document(self, response):
        for res in super().parse_document(response):
            for field in self.fields:
                if field not in res:
                    res[field] = ['']
            for i, month in enumerate(self.months_ru):
                res['date'][0] = res['date'][0].replace(month, str(i + 1))

            yield res
Example #12
class GazetaSpider(NewsSpider):
    name = 'gazeta'
    start_urls = ['https://www.gazeta.ru/sitemap.shtml']
    config = NewsSpiderConfig(
        title_path='//h1/text()',
        date_path='//time[contains(@itemprop, "datePublished")]/@datetime',
        date_format='%Y-%m-%dT%H:%M:%S%z',
        text_path='//div[contains(@itemprop, "articleBody")]//p//text() | '
        '//span[contains(@itemprop, "description")]//text()',
        topics_path='//div[contains(@class, "active")]/a/span/text()',
        authors_path='//span[contains(@itemprop, "author")]//text()')
    sitemap_le = LinkExtractor(
        restrict_xpaths='//div[contains(@class, "sitemap_list")]/ul/ul')
    articles_le = LinkExtractor(
        restrict_xpaths='//h2[contains(@itemprop, "headline")]')
    news_le = LinkExtractor(restrict_css='div.article_text h1.txt_2b')

    def parse(self, response):
        for link in self.sitemap_le.extract_links(response):
            yield scrapy.Request(url=link.url, callback=self.parse_page)

    def parse_page(self, response):
        if 'news' in response.url:
            links = self.news_le.extract_links(response)
        else:
            links = self.articles_le.extract_links(response)

        pub_dts = response.xpath(self.config.date_path).extract()
        for pub_dt, link in zip(pub_dts, links):
            pub_dt = pub_dt[:-3] + pub_dt[-3:].replace(
                ':', '')  # remove ":" for timezone correct parsing
            pub_dt = datetime.strptime(pub_dt, self.config.date_format)

            if pub_dt.date() >= self.until_date:
                yield scrapy.Request(url=link.url,
                                     callback=self.parse_document)

        # Request the next page if the last publication date is still not older than "until_date"
        if pub_dt.date() >= self.until_date:
            # Forming the next page link
            link_url = '{}?p=page&d={}'.format(
                self.start_urls[0], pub_dt.strftime('%d.%m.%Y_%H:%M'))

            yield scrapy.Request(
                url=link_url,
                priority=100,
                callback=self.parse,
                meta={'page_depth': response.meta.get('page_depth', 1) + 1})

    def parse_document(self, response):
        for res in super().parse_document(response):
            # Remove advertisement blocks
            ad_parts = ('\nРеклама\n', '\n.AdCentre_new_adv', ' AdfProxy.ssp',
                        '\nset_resizeblock_handler')

            res['text'] = [
                x.replace('\n', '\\n') for x in res['text']
                if x != '\n' and not x.startswith(ad_parts)
            ]

            # Remove ":" in timezone
            pub_dt = res['date'][0]
            res['date'] = [pub_dt[:-3] + pub_dt[-3:].replace(':', '')]

            yield res
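Several of these spiders strip the ":" from the UTC offset before parsing with %z, as in the parse_document above. A small illustration of why: on Python 3.7+, strptime's %z already accepts an offset written with a colon, so the stripping is only needed on older interpreters (the sample timestamp is the one quoted in the Kommersant config comment below).

from datetime import datetime

s = "2019-03-09T12:03:10+03:00"

# Python 3.7+ accepts the colon inside the %z offset directly
datetime.strptime(s, "%Y-%m-%dT%H:%M:%S%z")

# Equivalent of the stripping done in the spiders above (required on Python < 3.7)
datetime.strptime(s[:-3] + s[-3:].replace(":", ""), "%Y-%m-%dT%H:%M:%S%z")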
Example #13
class RussiaTassSpider(NewsSpider):
    name = "tass"
    start_urls = ["https://tass.ru/"]
    config = NewsSpiderConfig(
        title_path='_',
        date_path='_',
        date_format="%Y-%m-%d %H:%M:%S",
        text_path="div.text-content>div.text-block ::text",
        topics_path='_')
    custom_settings = {
        "DEPTH_LIMIT": 4,
        "DEPTH_STATS": True,
        "DEPTH_STATS_VERBOSE": True,
        "DOWNLOAD_DELAY": 10,
        "RANDOMIZE_DOWNLOAD_DELAY": True,
    }
    category_le = LinkExtractor(
        restrict_css=
        'ul.menu-sections-list>li>div.menu-sections-list__title-wrapper')

    def parse(self, response):
        for link in self.category_le.extract_links(response):
            yield scrapy.Request(url=link.url,
                                 priority=100,
                                 callback=self.parse_news_category,
                                 meta={})

    def parse_news_category(self, response):
        news_section = response.css(
            "section#news-list::attr(ng-init)").extract_first(default="")

        section_id = re.findall(r"sectionId\s+=\s+(.*?);", news_section)[0]
        exclude_ids = re.findall(r"excludeNewsIds\s*?=\s*?'(.*)';",
                                 news_section)[0]

        paging_data = {
            "sectionId": int(section_id),
            "limit": 20,
            "type": "",
            "excludeNewsIds": exclude_ids,
            "imageSize": 434,
        }
        yield self._create_api_request(paging_data, response.url)

    def _create_api_request(self, data, referer):
        return scrapy.Request(url="https://tass.ru/userApi/categoryNewsList",
                              method="POST",
                              body=json.dumps(data),
                              dont_filter=True,
                              headers={
                                  'Content-Type': 'application/json',
                                  'Referer': referer
                              },
                              callback=self.parse_news_list,
                              meta={
                                  "data": data,
                                  "referer": referer
                              })

    def parse_news_list(self, response):
        news_data = json.loads(response.body)
        last_time = news_data.get("lastTime", 0)
        data = response.meta["data"]
        referer = response.meta["referer"]
        data["timestamp"] = last_time
        yield self._create_api_request(data, referer)
        for news_item in news_data["newsList"]:
            url = response.urljoin(news_item["link"])
            yield scrapy.Request(url,
                                 callback=self.parse_document,
                                 meta={"news_item": news_item})

    def parse_document(self, response):
        news_item = response.meta["news_item"]
        url = response.url
        base_edition = urlsplit(self.start_urls[0])[1]
        edition = urlsplit(url)[1]

        l = ItemLoader(item=Document(), response=response)
        l.add_value('url', url)
        l.add_value('edition', '-' if edition == base_edition else edition)
        l.add_value('title', news_item["title"])
        l.add_value('topics', "")
        l.add_value(
            'date',
            datetime.fromtimestamp(news_item["date"]).strftime(
                self.config.date_format))
        l.add_css('text', self.config.text_path)
        yield l.load_item()
Example #14
class KommersantSpider(NewsSpider):
    name = "kommersant"

    base_url = "https://www.kommersant.ru"
    link_tmpl = "https://www.kommersant.ru/archive/list/77/{}"
    # Start with the current date
    page_dt = datetime.now()
    start_urls = [link_tmpl.format(page_dt.strftime("%Y-%m-%d"))]

    # Ignore "robots.txt" for this spider only
    custom_settings = {"ROBOTSTXT_OBEY": "False"}

    config = NewsSpiderConfig(
        title_path='(.//*[@class="article_name"])[1]//text()',
        date_path='//meta[contains(@property, "published_time")]/@content',
        date_format="%Y-%m-%dT%H:%M:%S%z",  # 2019-03-09T12:03:10+03:00
        text_path='//p[@class="b-article__text"]//text()',
        topics_path='//meta[contains(@name, "category")]/@content',
        authors_path='//p[contains(@class, "document_authors")]//text()',
        reposts_fb_path="_",
        reposts_vk_path="_",
        reposts_ok_path="_",
        reposts_twi_path="_",
        reposts_lj_path="_",
        reposts_tg_path="_",
        likes_path="_",
        views_path="_",
        comm_count_path=
        '//span[contains(@class, "comments-number hide1 hide2")]/text()',
    )
    news_le = LinkExtractor(
        restrict_xpaths='//div[@class="archive_result__item_text"]')

    def parse(self, response):
        # Parse most recent news
        for i in self.news_le.extract_links(response):
            yield scrapy.Request(url=i.url,
                                 callback=self.parse_document,
                                 meta={"page_dt": self.page_dt})

        # If there is a "more" button, request additional news from the archive via the recursive "parse_page" callback
        more_link = response.xpath(
            '//button[contains(@class, "lazyload-button")]/@data-lazyload-url'
        ).extract()
        if more_link:
            yield scrapy.Request(
                url="{}{}".format(self.base_url, more_link[0]),
                callback=self.parse_page,
                meta={"page_dt": self.page_dt},
            )

        # Requesting the next page if we need to
        self.page_dt -= timedelta(days=1)
        if self.page_dt.date() >= self.until_date:
            link_url = self.link_tmpl.format(self.page_dt.strftime("%Y-%m-%d"))

            yield scrapy.Request(
                url=link_url,
                priority=100,
                callback=self.parse,
                meta={
                    "page_depth": response.meta.get("page_depth", 1) + 1,
                    "page_dt": self.page_dt
                },
            )

    def parse_page(self, response):
        # Parse all articles on page
        for i in self.news_le.extract_links(response):
            yield scrapy.Request(url=i.url, callback=self.parse_document)

        # Take a link from "more" button
        more_link = response.xpath(
            '//button[contains(@class, "lazyload-button")]/@data-lazyload-url'
        ).extract()
        if more_link:
            yield scrapy.Request(
                url="{}{}".format(self.base_url, more_link[0]),
                callback=self.parse_page,
                meta={
                    "page_depth": response.meta.get("page_depth", 1),
                    "page_dt": response.meta["page_dt"]
                },
            )

    def parse_document(self, response):
        for res in super().parse_document(response):
            # If it's a gallery (no text) or a special project, don't return anything (those have a different html layout)
            if "text" not in res or "title" not in res:
                break

            # Remove ":" in timezone
            pub_dt = res["date"][0]
            res["date"] = [pub_dt[:-3] + pub_dt[-3:].replace(":", "")]

            yield res
Example #15
class MeduzaSpider(NewsSpider):
    name = "meduza"
    page_link_tmpl = "https://meduza.io/api/v3/search?chrono=news&page={}&per_page=24&locale=ru"
    article_link_tmpl = "https://meduza.io/{}"
    start_urls = [page_link_tmpl.format(0)]
    months_ru = [
        "января",
        "февраля",
        "марта",
        "апреля",
        "мая",
        "июня",
        "июля",
        "августа",
        "сентября",
        "октября",
        "ноября",
        "декабря",
    ]
    fields = [
        "title",
        "topics",
        "authors",
        "edition",
        "url",
        "text",
        "date",
    ]

    config = NewsSpiderConfig(
        title_path=
        '//h1[@class="RichTitle-root" or @class="SimpleTitle-root" or ' +
        '@class="RichTitle-root RichTitle-slide"]//text()',
        date_path=
        '//div[@class="GeneralMaterial-materialHeader" or @class="Slide-slide"]//time/text()',
        date_format="%H:%M, %d %m %Y",
        text_path=
        '//div[@class="GeneralMaterial-article" or @class="SlidesMaterial-layout" '
        +
        'or @class="MediaCaption-caption"]//p//text() | //div[@class="MediaCaption-caption"]//text() | '
        + '//p[@class="SimpleBlock-p" or @class="SimpleBlock-lead"]//text()',
        topics_path="_",
        authors_path="_",
        reposts_fb_path="_",
        reposts_vk_path="_",
        reposts_ok_path="_",
        reposts_twi_path="_",
        reposts_lj_path="_",
        reposts_tg_path="_",
        likes_path="_",
        views_path="_",
        comm_count_path="_",
    )

    def parse(self, response):
        last_page = False

        jsonresponse = json.loads(response.body_as_unicode())

        # Getting article items
        articles = [
            content for _, content in jsonresponse["documents"].items()
        ]
        # Sorting them from the most recent to the oldest
        articles = sorted(articles,
                          key=lambda x: x["published_at"],
                          reverse=True)

        # Filtering out articles older than "until_date" and noting when we have reached it
        filtered_articles = []
        for content in articles:
            pub_date = datetime.strptime(content["pub_date"],
                                         "%Y-%m-%d").date()
            if pub_date >= self.until_date:
                filtered_articles.append(content)
            else:
                last_page = True

        # Iterating through news on this page
        for content in filtered_articles:
            full_url = self.article_link_tmpl.format(content["url"])

            yield scrapy.Request(url=full_url, callback=self.parse_document)

        # Requesting a new page if needed
        if not last_page and jsonresponse["has_next"]:
            page_depth = response.meta.get("page_depth", 1)

            link_url = self.page_link_tmpl.format(page_depth)

            yield scrapy.Request(url=link_url,
                                 priority=100,
                                 callback=self.parse,
                                 meta={"page_depth": page_depth + 1})

    def parse_document(self, response):
        for res in super().parse_document(response):
            for field in self.fields:
                if field not in res:
                    res[field] = [""]
            for i, month in enumerate(self.months_ru):
                res["date"][0] = res["date"][0].replace(month, str(i + 1))

            yield res
Example #16
class IzSpider(NewsSpider):
    name = "iz"
    start_urls = ["https://iz.ru/sitemap.xml"]
    config = NewsSpiderConfig(
        title_path='//h1[contains(@itemprop, "headline")]/span/text()',
        date_path='//meta[contains(@property, "published_time")]/@content',
        date_format="%Y-%m-%dT%H:%M:%S%z",
        text_path="//article//p//text()",
        topics_path='//div[contains(@itemprop, "genre")]//'
        'a[contains(@href, "rubric") or contains(@href, "press-release")]//text()',
        authors_path='//div[contains(@itemprop, "author")]//a[contains(@href, "author")]//text()',
        reposts_fb_path="_",
        reposts_vk_path="_",
        reposts_ok_path="_",
        reposts_twi_path="_",
        reposts_lj_path="_",
        reposts_tg_path="_",
        likes_path="_",
        views_path="_",
        comm_count_path="_",
    )

    def parse(self, response):
        """Parse first main sitemap.xml by initial parsing method.
        Getting sub_sitemaps.
        """
        body = response.text  # Selector(text=...) expects a decoded str, not bytes
        links = Selector(text=body).xpath("//loc/text()").getall()
        # Parse last sitemap xml number
        # (in this case: "1"): https://iz.ru/export/sitemap/1/xml
        sitemap_n = int(links[-1].split("sitemap/")[1].split("/")[0])

        # The main "sitemap.xml" on this site isn't updated frequently enough, so probe for
        # newer sub-sitemaps by incrementing the sitemap number until an empty one is found
        sitemap_n += 1
        while True:
            link = "https://iz.ru/export/sitemap/{}/xml".format(sitemap_n)
            body = requests.get(link).text

            sitemap_links = Selector(text=body).xpath("//loc/text()").getall()
            # If there are links in this sitemap
            if sitemap_links:
                links.append(link)
                sitemap_n += 1
            else:
                break

        # Get all links from the sitemaps until we reach "until_date"
        for link in links[::-1]:
            yield Request(url=link, callback=self.parse_sitemap)

    def parse_sitemap(self, response):
        # Parse sub sitemaps
        body = response.text
        links = Selector(text=body).xpath("//loc/text()").getall()
        last_modif_dts = Selector(text=body).xpath("//lastmod/text()").getall()

        # Sort news by modification date descending
        news = [(link, last_modif_dt) for link, last_modif_dt in zip(links, last_modif_dts)]
        sorted_news = sorted(news, key=lambda x: x[1], reverse=True)

        # Iterate news and parse them
        for link, last_modif_dt in sorted_news:
            # Convert last_modif_dt to datetime
            last_modif_dt = datetime.strptime(last_modif_dt, "%Y-%m-%d")

            if last_modif_dt.date() >= self.until_date:
                yield Request(url=link, callback=self.parse_document)

    def parse_document(self, response):
        for res in super().parse_document(response):
            # Remove ":" in timezone
            pub_dt = res["date"][0]
            res["date"] = [pub_dt[:-3] + pub_dt[-3:].replace(":", "")]

            # If it is a video article, allow it not to have text
            if "/video/" in res["url"][0]:
                if "text" not in res:
                    res["text"] = [""]

            yield res

    def _get_last_page_dt(self, link):
        body = requests.get(link).text

        pub_dts = Selector(text=body).xpath("//lastmod/text()").getall()
        return datetime.strptime(pub_dts[0], "%Y-%m-%d")
Example #17
class VedomostiSpider(NewsSpider):
    name = 'vedomosti'
    start_urls = ['https://www.vedomosti.ru/newsline']
    config = NewsSpiderConfig(
        title_path='(.//div[contains(@class, "b-news-item__title")]//h1)[1]/text()',
        date_path='//time[@class="b-newsline-item__time"]/@pubdate',
        date_format='%Y-%m-%d %H:%M:%S %z',  # 2019-03-02 20:08:47 +0300
        text_path='(.//div[contains(@class, "b-news-item__text")])[1]/p//text()',
        topics_path='(.//div[contains(@class, "io-category")])[1]/text()',
        authors_path='_'
    )
    news_le = LinkExtractor(restrict_xpaths='//div[contains(@class, "b-newsline-item__title")]')

    def parse(self, response):
        if response.meta.get('page_depth', 1) > 1:
            # From the second page onwards the response is a JSON object with an "html" field,
            # so we rebuild a response object from it and extract the links the other way
            d = json.loads(response.body.decode('utf-8'))['html']
            resp = HtmlResponse(url=response.url, body=d, encoding='utf8')

            links = ['https://www.vedomosti.ru/{}'.format(i) for i in resp.xpath('//a/@href').extract()]

            # Getting publication date for every article
            pub_dts = resp.xpath(self.config.date_path).extract()
            # Convert datetimes of publication from string to datetime
            pub_dts = [datetime.strptime(dt, self.config.date_format) for dt in pub_dts]
        else:
            # Getting publication date for every article
            pub_dts = response.xpath(self.config.date_path).extract()
            # Convert datetimes of publication from string to datetime
            pub_dts = [datetime.strptime(dt, self.config.date_format) for dt in pub_dts]

            links = [i.url for i in self.news_le.extract_links(response)]

        for link, pub_dt in zip(links, pub_dts):
            if pub_dt.date() >= self.until_date:
                yield scrapy.Request(url=link, callback=self.parse_document, meta={'date': pub_dt})

        # Get the last publication datetime on the page to decide whether we need another page
        last_dt = pub_dts[-1]

        # Determine if this is the last page
        if last_dt.date() >= self.until_date:
            # Example: https://www.vedomosti.ru/newsline/more/2019-02-27%2017:18:41%20+0300
            link_url = '{}/more/{}%20{}%20{}'.format(self.start_urls[0],
                                                     last_dt.strftime('%Y-%m-%d'),
                                                     last_dt.strftime('%H:%M:%S'),
                                                     last_dt.strftime('%z'))

            yield scrapy.Request(url=link_url,
                                 priority=100,
                                 callback=self.parse,
                                 meta={'page_depth': response.meta.get('page_depth', 1) + 1}
                                 )

    def parse_document(self, response):
        for res in super().parse_document(response):
            res['date'] = [response.meta.get('date').strftime(self.config.date_format)]

            all_text = [text.strip() for text in res['text']]
            all_title = [text.strip() for text in res['title']]
            all_topic = [text.strip() for text in res['topics']]

            res['topics'] = [' '.join(all_topic)]
            res['title'] = [' '.join(all_title)]
            res['text'] = [' '.join(all_text)]

            yield res
Example #18
class RussiaTodaySpider(NewsSpider):
    name = "rt"

    start_urls = ["https://russian.rt.com/sitemap.xml"]

    config = NewsSpiderConfig(
        title_path="//h1/text()",
        date_path='//meta[contains(@name, "mediator_published_time")]/@content'
        ' | //span[@class="main-page-heading__date"]/text()',
        date_format="%Y-%m-%dT%H:%M:%S",
        text_path='//div[contains(@class, "article__text")]'
        '//*[not(contains(@class, "read-more")) and '
        'not(contains(@class, "article__cover"))]//text()'
        ' | //meta[contains(@name, "description")]/@content'
        ' | //div[@class="page-content"]/p/text()'
        ' | //div[@class="page-content"]/blockquote/p/text()'
        ' | //div[@class="page-content"]/p/a/text()'
        ' | //div[@class="page-content"]/h2/strong/text()',
        topics_path='//meta[contains(@name, "mediator_theme")]/@content'
        ' | //h2[@class="main-page-heading__tag"]/text()',
        authors_path='//meta[contains(@name, "mediator_author")]/@content'
        ' | //span[@class="main-page-heading__author"]/text()',
        reposts_fb_path="_",
        reposts_vk_path="_",
        reposts_ok_path="_",
        reposts_twi_path="_",
        reposts_lj_path="_",
        reposts_tg_path="_",
        likes_path="_",
        views_path="_",
        comm_count_path="_",
    )

    def parse(self, response):
        """Parse first main sitemap.xml by initial parsing method.
        Getting sub_sitemaps.
        """
        body = response.text  # Selector(text=...) expects a decoded str, not bytes
        links = Selector(text=body).xpath("//loc/text()").getall()

        for link in links:
            yield Request(url=link, callback=self.parse_sitemap)

    def parse_sitemap(self, response):
        """Parse each sub_sitemap. There is no today's news.
        """
        body = response.text
        links = Selector(text=body).xpath("//loc/text()").getall()
        lm_datetimes = Selector(text=body).xpath("//lastmod/text()").getall()

        for i in range(len(links)):
            if "https://russian.rt.com/tag/" not in links[i]:
                if (datetime.strptime(lm_datetimes[i][:22] + "00",
                                      "%Y-%m-%dT%H:%M:%S%z").date() >=
                        self.until_date):
                    yield Request(url=links[i], callback=self.parse_document)

    def fix_date(self, raw_date):
        """Fix date for regular and authors articles
        """
        months_ru = [
            "января",
            "февраля",
            "марта",
            "апреля",
            "мая",
            "июня",
            "июля",
            "августа",
            "сентября",
            "октября",
            "ноября",
            "декабря",
        ]

        if len(raw_date[0]) == 25:
            raw_date[0] = raw_date[0][:19]
            return raw_date
        else:
            for i, month in enumerate(months_ru):
                raw_date[0] = raw_date[0].replace(month, str(i + 1))
            return datetime.strptime(raw_date[0],
                                     "%d %m %Y,").strftime("%Y-%m-%dT%H:%M:%S")

    def cut_instagram(self, raw_text):
        """Cut instagram quote
        """
        clear_text = []
        i = 0
        while i < len(raw_text):
            if " Посмотреть эту публикацию в Instagram" == raw_text[i]:

                while "PDT" not in raw_text[i]:
                    i += 1
                i += 1
            else:
                clear_text.append(raw_text[i])
                i += 1
        return clear_text

    def parse_document(self, response):
        """Final parsing method.
        Parse each article."""
        for item in super().parse_document(response):
            item["date"] = self.fix_date(item["date"])
            item["text"] = self.cut_instagram(item["text"])
            yield item
Example #19
class KommersantSpider(NewsSpider):
    name = 'kommersant'

    base_url = 'https://www.kommersant.ru'
    link_tmpl = 'https://www.kommersant.ru/archive/list/77/{}'
    # Start with the current date
    page_dt = datetime.now()
    start_urls = [link_tmpl.format(page_dt.strftime('%Y-%m-%d'))]

    # Ignore "robots.txt" for this spider only
    custom_settings = {'ROBOTSTXT_OBEY': False}

    config = NewsSpiderConfig(
        title_path='(.//*[@class="article_name"])[1]//text()',
        date_path='//meta[contains(@property, "published_time")]/@content',
        date_format='%Y-%m-%dT%H:%M:%S%z',  # 2019-03-09T12:03:10+03:00
        text_path='//p[@class="b-article__text"]//text()',
        topics_path='//meta[contains(@name, "category")]/@content',
        authors_path='//p[contains(@class, "document_authors")]//text()')
    news_le = LinkExtractor(
        restrict_xpaths='//div[@class="archive_result__item_text"]')

    def parse(self, response):
        # Parse most recent news
        for i in self.news_le.extract_links(response):
            yield scrapy.Request(url=i.url,
                                 callback=self.parse_document,
                                 meta={'page_dt': self.page_dt})

        # If there is a "more" button, request additional news from the archive via the recursive "parse_page" callback
        more_link = response.xpath(
            '//button[contains(@class, "lazyload-button")]/@data-lazyload-url'
        ).extract()
        if more_link:
            yield scrapy.Request(url='{}{}'.format(self.base_url,
                                                   more_link[0]),
                                 callback=self.parse_page,
                                 meta={'page_dt': self.page_dt})

        # Requesting the next page if we need to
        self.page_dt -= timedelta(days=1)
        if self.page_dt.date() >= self.until_date:
            link_url = self.link_tmpl.format(self.page_dt.strftime('%Y-%m-%d'))

            yield scrapy.Request(url=link_url,
                                 priority=100,
                                 callback=self.parse,
                                 meta={
                                     'page_depth':
                                     response.meta.get('page_depth', 1) + 1,
                                     'page_dt':
                                     self.page_dt
                                 })

    def parse_page(self, response):
        # Parse all articles on page
        for i in self.news_le.extract_links(response):
            yield scrapy.Request(url=i.url, callback=self.parse_document)

        # Take a link from "more" button
        more_link = response.xpath(
            '//button[contains(@class, "lazyload-button")]/@data-lazyload-url'
        ).extract()
        if more_link:
            yield scrapy.Request(url='{}{}'.format(self.base_url,
                                                   more_link[0]),
                                 callback=self.parse_page,
                                 meta={
                                     'page_depth':
                                     response.meta.get('page_depth', 1),
                                     'page_dt':
                                     response.meta['page_dt']
                                 })

    def parse_document(self, response):
        for res in super().parse_document(response):
            # If it's a gallery (no text) or a special project, don't return anything (those have a different html layout)
            if 'text' not in res or 'title' not in res:
                break

            # Remove ":" in timezone
            pub_dt = res['date'][0]
            res['date'] = [pub_dt[:-3] + pub_dt[-3:].replace(':', '')]

            yield res
Example #20
class GazetaSpider(NewsSpider):
    name = "gazeta"
    start_urls = ["https://www.gazeta.ru/sitemap.xml"]

    config = NewsSpiderConfig(
        title_path='//div[contains(@itemprop, "alternativeHeadline")]//text() | ' "//h1/text()",
        date_path='//time[contains(@itemprop, "datePublished")]/@datetime',
        date_format="%Y-%m-%dT%H:%M:%S%z",
        text_path='//div[contains(@itemprop, "articleBody")]//p//text() | '
        '//span[contains(@itemprop, "description")]//text()',
        topics_path='//div[contains(@class, "active")]/a/span/text()',
        authors_path='//span[contains(@itemprop, "author")]//text()',
        reposts_fb_path="_",
        reposts_vk_path="_",
        reposts_ok_path="_",
        reposts_twi_path="_",
        reposts_lj_path="_",
        reposts_tg_path="_",
        likes_path="_",
        views_path="_",
        comm_count_path="_",
    )

    def parse(self, response):
        # Parse main sitemap
        body = response.text  # Selector(text=...) expects a decoded str, not bytes
        links = Selector(text=body).xpath("//loc/text()").getall()
        last_modif_dts = Selector(text=body).xpath("//lastmod/text()").getall()

        for link, last_modif_dt in zip(links, last_modif_dts):
            # Convert last_modif_dt to datetime
            last_modif_dt = datetime.strptime(last_modif_dt.replace(":", ""), "%Y-%m-%dT%H%M%S%z")

            if last_modif_dt.date() >= self.until_date:
                yield Request(url=link, callback=self.parse_sub_sitemap)

    def parse_sub_sitemap(self, response):
        # Parse sub sitemaps
        body = response.text
        links = Selector(text=body).xpath("//loc/text()").getall()
        last_modif_dts = Selector(text=body).xpath("//lastmod/text()").getall()

        for link, last_modif_dt in zip(links, last_modif_dts):
            # Convert last_modif_dt to datetime
            last_modif_dt = datetime.strptime(last_modif_dt.replace(":", ""), "%Y-%m-%dT%H%M%S%z")

            if last_modif_dt.date() >= self.until_date:
                yield Request(url=link, callback=self.parse_articles_sitemap)

    def parse_articles_sitemap(self, response):
        # Parse sub sitemaps
        body = response.text
        links = Selector(text=body).xpath("//loc/text()").getall()
        last_modif_dts = Selector(text=body).xpath("//lastmod/text()").getall()

        for link, last_modif_dt in zip(links, last_modif_dts):
            # Convert last_modif_dt to datetime
            last_modif_dt = datetime.strptime(last_modif_dt.replace(":", ""), "%Y-%m-%dT%H%M%S%z")

            if last_modif_dt.date() >= self.until_date:
                if link.endswith(".shtml") and not link.endswith("index.shtml"):
                    yield Request(url=link, callback=self.parse_document)

    def parse_document(self, response):
        for res in super().parse_document(response):
            # Remove advertisement blocks
            ad_parts = ("\nРеклама\n", "\n.AdCentre_new_adv", " AdfProxy.ssp", "\nset_resizeblock_handler")

            res["text"] = [x.replace("\n", "\\n") for x in res["text"] if x != "\n" and not x.startswith(ad_parts)]

            # Remove ":" in timezone
            pub_dt = res["date"][0]
            res["date"] = [pub_dt[:-3] + pub_dt[-3:].replace(":", "")]

            yield res
Example #21
class RussiaTodaySpider(NewsSpider):
    name = 'rt'

    start_urls = ['https://russian.rt.com/sitemap.xml']

    config = NewsSpiderConfig(
        title_path='//h1/text()',
        date_path='//meta'
        '[contains(@name, "mediator_published_time")]/@content',
        date_format="%Y-%m-%dT%H:%M:%S",
        text_path='//div[contains(@class, "article__text")]//text()',
        topics_path='//meta[contains(@name, "mediator_theme")]/@content',
        authors_path='_',
        reposts_fb_path='_',
        reposts_vk_path='_',
        reposts_ok_path='_',
        reposts_twi_path='_',
        reposts_lj_path='_',
        reposts_tg_path='_',
        likes_path='_',
        views_path='_',
        comm_count_path='_')

    def parse(self, response):
        """Parse first main sitemap.xml by initial parsing method.
        Getting sub_sitemaps.
        """
        body = response.text  # Selector(text=...) expects a decoded str, not bytes
        links = Selector(text=body).xpath('//loc/text()').getall()

        for link in links:
            yield Request(url=link, callback=self.parse_sitemap)

    def parse_sitemap(self, response):
        """Parse each sub_sitemap.
        """
        body = response.text
        links = Selector(text=body).xpath('//loc/text()').getall()

        for link in links:
            yield Request(url=link, callback=self.parse_document)

    def _fix_syntax(self, sample: List[str], idx_split: int) -> List[str]:
        """Fix timestamp syntax, droping timezone postfix.
        """
        sample = [sample[0][:idx_split]]
        return sample

    def _get_date(self, lst: List[str]):
        """Convert list into date obj.
        """
        y, m, d = [int(num) for num in lst]
        return date(y, m, d)

    def parse_document(self, response):
        """Final parsing method.
        Parse each article."""
        for item in super().parse_document(response):

            # Try to drop timezone postfix.
            try:
                item['date'] = self._fix_syntax(item['date'], -6)
            except KeyError:
                self.logger.error('No date value.')
            else:
                raw_date = item['date'][0][:10].split('-')
                processed_date = self._get_date(raw_date)

                if processed_date >= self.until_date:
                    yield item
Example #22
class RbcSpider(NewsSpider):
    name = "rbc"
    link_tmpl = "https://www.rbc.ru/v10/ajax/get-news-feed/project/rbcnews/lastDate/{}/limit/22"
    start_urls = [link_tmpl.format(int(time.time()))]
    config = NewsSpiderConfig(
        title_path='(.//span[contains(@class, "js-slide-title")])[1]//text()',
        date_path="_",
        date_format="%Y-%m-%d %H:%M:%S",
        text_path='(.//div[contains(@class, "article__text")])'
        '/*[not(self::script) and not(self::div[@class="subscribe-infographic"])]//text()',
        topics_path='(.//a[contains(@class, "article__header__category")])[1]//text()',
        authors_path='//div[contains(@class, "article__authors")]/text()',
        reposts_fb_path="_",
        reposts_vk_path="_",
        reposts_ok_path="_",
        reposts_twi_path="_",
        reposts_lj_path="_",
        reposts_tg_path="_",
        likes_path="_",
        views_path="_",
        comm_count_path="_",
    )

    def parse(self, response):
        items = json.loads(response.body.decode("utf-8"))["items"]

        pub_dt = None
        for i in items:
            resp = HtmlResponse(url="", body=i["html"], encoding="utf8")

            link = resp.xpath("//a/@href").extract()[0]
            pub_dt = datetime.fromtimestamp(i["publish_date_t"])

            if pub_dt.date() >= self.until_date:
                yield scrapy.Request(url=link, callback=self.parse_document, meta={"pub_dt": pub_dt})

        # Request the next page if the publication date of the last article is not older than "until_date"
        if pub_dt and pub_dt.date() >= self.until_date:
            # Forming the next page link
            link_url = self.link_tmpl.format(int(pub_dt.timestamp()))

            yield scrapy.Request(
                url=link_url,
                priority=100,
                callback=self.parse,
                meta={"page_depth": response.meta.get("page_depth", 1) + 1},
            )

    def parse_document(self, response):
        for res in super().parse_document(response):
            res["date"] = [response.meta["pub_dt"].strftime(self.config.date_format)]

            # Return the article only if it is hosted on "www.rbc.ru"
            # (not "sportrbc.ru", "delovtb.rbc.ru" etc., because those have a different html layout)
            if res["edition"][0] == "-":
                if "authors" in res:
                    res["authors"] = [
                        i.replace("\n", "").strip() for i in res["authors"] if i.replace("\n", "").strip()
                    ]
                res["text"] = [i.replace("\xa0", " ") for i in res["text"]]

                yield res
Example #23
class RiaSpider(NewsSpider):
    name = "ria"
    start_urls = ["https://www.ria.ru"]
    config = NewsSpiderConfig(
        title_path='//h1[contains(@class, "article__title")]/text()',
        date_path='//div[contains(@class, "endless__item")]/@data-published',
        date_format="%Y-%m-%dT%H:%M",
        text_path=
        '//div[contains(@class, "article__block") and @data-type = "text"]//text()',
        topics_path='//a[contains(@class, "article__tags-item")]/text()',
        authors_path="_",
        reposts_fb_path="_",
        reposts_vk_path="_",
        reposts_ok_path="_",
        reposts_twi_path="_",
        reposts_lj_path="_",
        reposts_tg_path="_",
        likes_path='//span[contains(@class,"m-value")]/text()',
        views_path='//span[contains(@class,"statistic__item m-views")]/text()',
        comm_count_path="_",
    )
    news_le = LinkExtractor(restrict_css="div.lenta__item")

    def parse(self, response):
        article_links = self.news_le.extract_links(response)

        last_link = ""
        for link in article_links:
            last_link = link.url

            yield scrapy.Request(url=link.url, callback=self.parse_document)

        dt = self._get_last_dt_on_page(last_link)

        if datetime.strptime(
                dt, self.config.date_format).date() >= self.until_date:
            # Getting and forming the next page link
            next_page_link = response.xpath(
                '//div[contains(@class, "lenta__item")]/@data-next').extract(
                )[0]
            link_url = "{}{}".format(self.start_urls[0], next_page_link)

            yield scrapy.Request(
                url=link_url,
                priority=100,
                callback=self.parse,
                meta={"page_depth": response.meta.get("page_depth", 1) + 1},
            )

    def parse_document(self, response):
        for res in super().parse_document(response):
            # Leave only the last tag
            # (the last tag is always a global website tag)
            res["topics"] = [res["topics"][-1]]

            yield res

    def _get_last_dt_on_page(self, link):
        r = requests.get(link)
        source_code = r.text

        root = lxml.html.fromstring(source_code)

        dt = root.xpath(self.config.date_path)[0]

        return dt
Example #24
class MeduzaSpider(NewsSpider):
    name = 'meduza'

    # Page link template
    page_link_tmpl = 'https://meduza.io/api/v3/search?chrono=news&page={}&per_page=24&locale=ru'
    # Article link template
    article_link_tmpl = 'https://meduza.io/api/w4/{}'
    # Start with the first page
    start_urls = [page_link_tmpl.format(0)]

    config = NewsSpiderConfig(
        title_path='_',
        date_path='_',
        date_format='%Y-%m-%d %H:%M:%S',
        text_path='_',
        topics_path='_',
        authors_path='_'
    )

    def parse(self, response):
        last_page = False

        jsonresponse = json.loads(response.body_as_unicode())

        # Getting article items
        articles = [content for _, content in jsonresponse['documents'].items()]
        # Sorting them from the most recent to the oldest
        articles = sorted(articles, key=lambda x: x['published_at'], reverse=True)

        # Filtering out articles older than "until_date" and noting when we have reached it
        filtered_articles = []
        for content in articles:
            pub_date = datetime.strptime(content['pub_date'], '%Y-%m-%d').date()
            if pub_date >= self.until_date:
                filtered_articles.append(content)
            else:
                last_page = True

        # Iterating through news on this page
        for content in filtered_articles:
            full_url = self.article_link_tmpl.format(content['url'])

            yield scrapy.Request(url=full_url, callback=self.parse_document)

        # Requesting a new page if needed
        if not last_page and jsonresponse['has_next']:
            page_depth = response.meta.get('page_depth', 1)

            link_url = self.page_link_tmpl.format(page_depth)

            yield scrapy.Request(url=link_url,
                                 priority=100,
                                 callback=self.parse,
                                 meta={'page_depth': page_depth + 1}
                                 )

    def parse_document(self, response):
        news_item = json.loads(response.body_as_unicode())['root']
        url = 'https://meduza.io/{}'.format(news_item['url'])

        # Taking all blocks from response with information
        blocks = self._get_text_blocks(news_item)

        # Extract text paragraphs from every block of the article
        text_paragraphs = self._extract_text_from_blocks(blocks)

        base_edition = urlsplit(self.start_urls[0])[1]
        edition = urlsplit(url)[1]

        # Replace every \xa0 with space
        text_paragraphs = [text.replace('\xa0', ' ') for text in text_paragraphs]
        title = news_item['title'].replace('\xa0', ' ')

        # Constructing the resulting item
        l = ItemLoader(item=Document(), response=response)
        l.add_value('url', url)
        l.add_value('edition', '-' if edition == base_edition else edition)
        l.add_value('title', title)
        l.add_value('topics', '')
        l.add_value('date', datetime.utcfromtimestamp(news_item['datetime']).strftime(self.config.date_format))
        l.add_value('text', text_paragraphs if text_paragraphs else [''])
        l.add_value('authors', news_item['source']['name'] if 'source' in news_item else [''])

        yield l.load_item()

    def _extract_text_from_blocks(self, blocks):
        text_paragraphs = []

        # Block types which contain text
        block_types = ['p', 'context_p', 'blockquote',
                       'image', 'h3', 'card_title', 'ul', 'lead']
        for block in blocks:
            if block['type'] in block_types:
                if block['type'] == 'image':
                    text_paragraphs.append(block['data'].get('caption', ''))
                elif block['type'] == 'card_title':
                    text_paragraphs.append(block['data'].get('text', ''))
                elif block['type'] == 'ul':
                    for one_elem in block['data']:
                        text_paragraphs.append(one_elem)
                else:
                    # Paragraphs can be empty (without text)
                    text_paragraphs.append(block.get('data', ''))

        return text_paragraphs

    def _get_text_blocks(self, news_item):
        blocks = []

        # Get all blocks with data depending on article type (news, slides, cards)
        if 'blocks' in news_item['content']:
            blocks = news_item['content']['blocks']
        elif 'slides' in news_item['content']:
            # Joining all slides into a list of blocks
            for one_slide in news_item['content']['slides']:
                for block in one_slide['blocks']:
                    blocks.append(block)
        elif 'cards' in news_item['content']:
            # Joining all cards into a list of blocks
            for one_slide in news_item['content']['cards']:
                for block in one_slide['blocks']:
                    blocks.append(block)

        return blocks
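All of the examples rely on a shared NewsSpiderConfig object that bundles the selectors and date format consumed by the base class's parse_document. Its real definition is not shown here; below is a plausible minimal sketch only, assuming it is nothing more than a container for those fields (field names taken from the examples above, with "_" apparently serving as the placeholder for "selector not available").

from dataclasses import dataclass


# Sketch only: the project's actual NewsSpiderConfig may be a namedtuple or
# carry extra validation; the field names below are the ones used throughout
# the examples, and "_" is the placeholder value seen for unavailable fields.
@dataclass
class NewsSpiderConfig:
    title_path: str
    date_path: str
    date_format: str
    text_path: str
    topics_path: str
    authors_path: str = "_"
    reposts_fb_path: str = "_"
    reposts_vk_path: str = "_"
    reposts_ok_path: str = "_"
    reposts_twi_path: str = "_"
    reposts_lj_path: str = "_"
    reposts_tg_path: str = "_"
    likes_path: str = "_"
    views_path: str = "_"
    comm_count_path: str = "_"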