Exemplo n.º 1
0
    def parse_article(self, response):
        """Parse a Gazeta do Povo article page and yield a CrawlerNewsItem.

        Also schedules a follow-up request for the article's comments when a
        comments token is present on the page.
        """
        # get title
        title = response.css(
            'h1.gp-coluna.col-8.c-titulo ::text').extract_first()
        # get sub_title
        sub_title = response.css('h2.c-sumario ::text').extract_first()
        # get author
        author = response.css('li.c-autor span::text').extract_first()
        # get date (normalized by the spider's own format_date helper)
        date = self.format_date(
            response.css('li.data-publicacao time::text').extract_first())
        # get section
        section = response.css('a.c-nome-editoria span::text').extract_first()
        # get text: join all body paragraphs in one pass instead of the
        # quadratic += concatenation loop
        text = "".join(response.css(
            'div.gp-coluna.col-6.texto-materia.paywall-google p::text'
        ).extract())

        # get comments: the data-token attribute authenticates the call to
        # the comments web service
        self.token = response.css(
            'div.sociabilizacao-load-area ::attr(data-token)').extract_first()
        # guard: extract_first() returns None when the element is missing,
        # which would raise TypeError on string concatenation
        if self.token is not None:
            link_comments = 'https://live.gazetadopovo.com.br/webservice/comentario/abaComentarios?comentario=&token=' + self.token
            yield response.follow(link_comments, self.parse_comments)

        article = CrawlerNewsItem(title=title,
                                  sub_title=sub_title,
                                  author=author,
                                  date=date,
                                  text=text,
                                  section=section,
                                  _id=response.request.url)

        yield article
Exemplo n.º 2
0
    def parse_article(self, response):
        """Parse an O Globo article page, yield the item, then request its comments."""
        # get title
        title = response.css('h1.article__title::text').extract_first()
        # get sub_title
        sub_title = response.css('h2.article__subtitle::text').extract_first()
        # get article's date (normalized by the spider's format_date helper;
        # str() keeps the original behavior of passing "None" when missing)
        date = self.format_date(
            str(response.css('div.article__date::text').extract_first()))
        # get author
        author = response.css('div.article__author::text').extract_first()
        # get text: single-pass join instead of quadratic += concatenation
        text = "".join(response.css(
            'div.article__content-container.protected-content p::text'
        ).extract())
        # get section
        section = response.css(
            'div.site-header__section-name a::text').extract_first()
        # get id_article: the article id is the last '-'-separated chunk of
        # the URL (rsplit replaces the original id_article[len(id_article)-1])
        id_article = response.request.url.rsplit('-', 1)[-1]

        news = CrawlerNewsItem(
            _id=id_article, title=title, sub_title=sub_title, date=date,
            author=author, text=text, section=section,
            url=response.request.url)

        yield news

        # get comments by json (page 1 of the paginated comments endpoint)
        yield response.follow(
            'https://oglobo.globo.com/ajax/comentario/buscar/' + id_article
            + '/1.json', self.parse_comments)
Exemplo n.º 3
0
    def parse_article(self, response):
        """Parse an article page and yield a CrawlerNewsItem for it."""
        # get title
        title = response.css('h1.c-content-head__title::text').extract_first()
        # get sub_title
        sub_title = response.css(
            'h2.c-content-head__subtitle::text').extract_first()
        # get article's date: parse the ISO datetime attribute and convert it
        # to a Unix-timestamp string.
        # NOTE(review): '%s' is a non-portable glibc strftime extension
        # (unavailable on Windows); str(int(parsed.timestamp())) would be the
        # portable equivalent — confirm before changing stored data format.
        date = dateutil.parser.parse(
            response.css('time.c-more-options__published-date::attr(datetime)'
                         ).extract_first()).strftime('%s')
        # get author
        author = response.css(
            'strong.c-signature__author::text').extract_first()
        # get text: join all paragraph fragments in one pass instead of the
        # quadratic += concatenation loop
        text = "".join(response.xpath(
            "//div[@class='c-news__body']/p//text()").extract())
        # get section
        section = response.css(
            'li.c-site-nav__item.c-site-nav__item--section a::text'
        ).extract_first()

        article = CrawlerNewsItem(_id=response.request.url,
                                  title=title,
                                  sub_title=sub_title,
                                  date=date,
                                  author=author,
                                  text=text,
                                  section=section,
                                  url=response.request.url)

        yield article
Exemplo n.º 4
0
    def parse_article(self, response):
        """Parse an article page and yield a CrawlerNewsItem for it."""
        # get title
        title = response.css('h1.eltdf-title-text ::text').extract_first()
        # get sub_title
        sub_title = response.css('div.wpb_wrapper h3::text').extract_first()
        # get article's date (normalized by the spider's format_date helper)
        date = self.format_date(
            response.css('div.eltdf-post-info-date.entry-date.updated a::text'
                         ).extract_first())
        # get author
        author = response.css(
            'a.eltdf-post-info-author-link ::text').extract_first()
        # get text: join all paragraph fragments in one pass instead of the
        # quadratic += concatenation loop
        text = "".join(
            response.css('div.eltdf-post-text p::text').extract())
        # get section
        section = response.css(
            'div.eltdf-post-info-category a::text').extract_first()

        news = CrawlerNewsItem(title=title,
                               sub_title=sub_title,
                               date=date,
                               author=author,
                               text=text,
                               section=section,
                               _id=response.request.url)

        yield news
Exemplo n.º 5
0
    def parse_article(self, response):
        """Parse an article page and yield a CrawlerNewsItem for it."""
        # get title
        title = response.css('h1.n--noticia__title::text').extract_first()
        # get sub_title
        sub_title = response.css(
            'h2.n--noticia__subtitle::text').extract_first()
        # get article's date
        dt_article = response.css(
            'div.n--noticia__state-desc p::text').extract_first()
        # transform article's date from isodate to timestamp
        dt_article = self.format_date(dt_article)
        # get article's section
        section = response.css(
            'div.header-current-page.cor-e a::text').extract_first()
        # get author
        author = response.css(
            'div.n--noticia__state-title::text').extract_first()
        # get text: join all paragraph fragments in one pass (replaces the
        # quadratic += loop and the unused pre-initialized locals)
        text_article = "".join(response.css(
            'div.n--noticia__content.content p::text').extract())

        # NOTE(review): the original extracted `author` but never stored it;
        # sibling parsers all pass author to the item, so it is included here
        # for consistency — confirm downstream consumers accept the field.
        article = CrawlerNewsItem(_id=response.request.url,
                                  title=title,
                                  sub_title=sub_title,
                                  date=dt_article,
                                  author=author,
                                  text=text_article,
                                  section=section,
                                  url=response.request.url)

        yield article
Exemplo n.º 6
0
    def parse_article(self, response):
        """Parse an article page (Spanish-language markup) and yield the item."""
        # get title
        title = response.css('h1.articulo-titulo ::text').extract_first()
        # get sub_title
        sub_title = response.css(
            'h2.articulo-subtitulo ::text').extract_first()
        # get article's date: ISO datetime attribute -> Unix-timestamp string.
        # NOTE(review): '%s' is a non-portable glibc strftime extension
        # (unavailable on Windows); str(int(parsed.timestamp())) would be the
        # portable equivalent — confirm before changing stored data format.
        date = dateutil.parser.parse(
            response.css('time.articulo-actualizado ::attr(datetime)').
            extract_first()).strftime('%s')
        # get author
        author = response.css('span.autor-nombre a::text').extract_first()
        # get text: join all paragraph fragments in one pass instead of the
        # quadratic += concatenation loop
        text = "".join(
            response.css('div.articulo-cuerpo p::text').extract())
        # get section
        section = response.css('a.enlace span::text').extract_first()

        news = CrawlerNewsItem(title=title,
                               sub_title=sub_title,
                               date=date,
                               author=author,
                               text=text,
                               section=section,
                               _id=response.request.url)

        yield news
Exemplo n.º 7
0
    def parse_news(self, response):
        """Build a CrawlerNewsItem for a news page, delegating each field
        to its dedicated `_parse_*` helper.
        """
        item = CrawlerNewsItem()

        # fixed metadata
        item['url'] = response.url
        item['article_from'] = self.name
        item['article_type'] = 'news'

        # field name -> extractor helper; assignment order matches the
        # original field-by-field sequence (dicts preserve insertion order)
        extractors = {
            'title': self._parse_title,
            'publish_date': self._parse_publish_date,
            'authors': self._parse_authors,
            'tags': self._parse_tags,
            'text': self._parse_text,
            'text_html': self._parse_text_html,
            'images': self._parse_images,
            'video': self._parse_video,
            'links': self._parse_links,
        }
        for field, extract in extractors.items():
            item[field] = extract(response)

        return item
Exemplo n.º 8
0
    def parse_article(self, response):
        """Parse an article page, yield the item, then yield its inline comments."""
        # get title
        title = response.css('h1.article-title::text').extract_first()
        # get sub_title
        sub_title = response.css('h2.article-subtitle::text').extract_first()
        # get article's date (normalized by the spider's format_date helper)
        date = self.format_date(
            response.css('div.article-date span::text').extract_first())
        # get author
        author = response.css('div.article-author span::text').extract_first()
        # get text: join all paragraph fragments in one pass instead of the
        # quadratic += concatenation loop
        text = "".join(response.xpath(
            "//section[@class='article-content']/p//text()").extract())
        # get section
        section = response.css('div.article-category a::text').extract_first()

        news = CrawlerNewsItem(_id=response.request.url,
                               title=title,
                               sub_title=sub_title,
                               date=date,
                               author=author,
                               text=text,
                               section=section,
                               url=response.request.url)

        yield news

        # get comments: zip pairs each comment body with its date and author;
        # zip stops at the shortest list, so mismatched selectors drop extras
        for (text_comment, dt_comment, author_comment) in zip(
                response.css('div.comment-text p::text'),
                response.css('span.comment-meta.comment-metadata a::text'),
                response.css('div.comment-author.vcard cite::text')):
            comment = CrawlerNewsCommentItem(
                date=self.format_date(dt_comment.extract(
                )),  # transform comments' date from isodate to timestamp
                author=author_comment.extract(),
                text=text_comment.extract(),
                id_article=response.request.url)

            yield comment
Exemplo n.º 9
0
    def parse_article(self, response):
        """Parse an article page, yield the item, then yield its inline comments."""
        # get title
        title = response.css('h1::text').extract_first()
        # get article's date
        dt_article = response.css(
            'time.entry-date.published::attr(datetime)').extract_first()
        # transform article's date from isodate to timestamp.
        # NOTE(review): '%s' is a non-portable glibc strftime extension
        # (unavailable on Windows); str(int(parsed.timestamp())) would be the
        # portable equivalent — confirm before changing stored data format.
        dt_article = dateutil.parser.parse(dt_article).strftime('%s')
        # get article's section
        section = response.css('span.categoria a::text').extract_first()
        # get text: join all paragraph fragments in one pass instead of the
        # quadratic += concatenation loop
        text_article = "".join(response.xpath(
            "//div[@class='entry-content']/p//text()").extract())

        article = CrawlerNewsItem(_id=response.request.url,
                                  title=title,
                                  date=dt_article,
                                  text=text_article,
                                  section=section)

        yield article

        # get comments: zip pairs each comment body with its date and author;
        # zip stops at the shortest list, so mismatched selectors drop extras
        for (text_comment, dt_comment, author_comment) in zip(
                response.css('div.comment-content p::text'),
                response.css('div.comment-metadata time::attr(datetime)'),
                response.css('div.comment-author.vcard b::text')):
            comment = CrawlerNewsCommentItem(
                date=dateutil.parser.parse(dt_comment.extract()).strftime(
                    '%s'
                ),  # transform comments' date from isodate to timestamp
                author=author_comment.extract(),
                text=text_comment.extract(),
                id_article=response.request.url)

            yield comment