Example #1
    def get_details(self, response):
        """
        Given the response for a news page, yields an item containing
        all the data requested in the lab.
        """
        item = RiLab01Item()
        title = response.css('h1::text').get(default='').strip()
        paragraphs = response.css('p').xpath('string()').getall()
        sub_title = paragraphs[2]
        try:
            author = response.css('strong').xpath('string()').get().replace("-", "").strip()
        except AttributeError:
            # No <strong> tag: fall back to the byline paragraph.
            author = paragraphs[5].split("-")[0].strip()
        date = paragraphs[0]
        hour = response.css('p.meta::text').get().strip().split()[-1]
        section = response.css('body::attr(id)').get().split("-")[-1]
        text = paragraphs[5:]
        text.pop()  # drop the trailing non-article paragraph
        text = self.text_formater(text)
        item['title'] = title
        item['sub_title'] = sub_title
        item['author'] = author
        item['date'] = self.date_formater(date, hour)
        item['section'] = section
        item['text'] = text
        item['url'] = response.url
        yield item
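Every example in this listing fills the same RiLab01Item, whose definition is never shown. A minimal sketch of what it presumably looks like, together with the text_formater and date_formater helpers this spider calls; the field list and both helper bodies are assumptions reconstructed from how they are used above:

    import scrapy

    class RiLab01Item(scrapy.Item):
        # Fields inferred from the assignments in the examples;
        # the real definition may differ.
        _id = scrapy.Field()
        title = scrapy.Field()
        sub_title = scrapy.Field()
        author = scrapy.Field()
        date = scrapy.Field()
        section = scrapy.Field()
        text = scrapy.Field()
        url = scrapy.Field()

    class NewsSpider(scrapy.Spider):
        name = 'news'

        def text_formater(self, paragraphs):
            # Hypothetical: collapse the paragraph list into one string.
            return ' '.join(p.strip() for p in paragraphs if p.strip())

        def date_formater(self, date, hour):
            # Hypothetical: merge the date paragraph and the hour fragment.
            return '{} {}'.format(date.strip(), hour)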
Example #2
    def parse(self, response):
        if response.url in self.start_urls:
            # NAV: collect article links from the listing page.
            links = response.css('h3.td-module-title a::attr(href)').getall()
            for link in links:
                yield response.follow(link, callback=self.parse)
        else:
            # PARSE: extract the article itself.
            item = RiLab01Item()
            item['_id'] = self.id
            item['author'] = response.css('div.td-post-author-name a::text').get()
            item['date'] = response.css('span.td-post-date time::attr(datetime)').get()
            item['title'] = response.css('title::text').get()
            item['sub_title'] = 'N/A'
            item['section'] = self.getSection(response)
            item['text'] = self.getText(response)
            item['url'] = response.url
            self.id = self.id + 1
            yield item
            if self.id < 200:
                # NAV: follow the next/previous article link.
                link = response.css('div.td-post-next-prev-content a::attr(href)').get()
                if link:
                    yield response.follow(link, callback=self.parse)
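getSection and getText are not part of the snippet. A plausible sketch, assuming the section lives in the theme's breadcrumb and the body sits in the usual td-post-content container; both selectors are guesses:

    def getSection(self, response):
        # Hypothetical: use the last breadcrumb entry as the section name.
        crumbs = response.css('div.entry-crumbs a::text').getall()
        return crumbs[-1] if crumbs else 'N/A'

    def getText(self, response):
        # Hypothetical: concatenate the article's paragraph texts.
        paragraphs = response.css('div.td-post-content p::text').getall()
        return ' '.join(p.strip() for p in paragraphs if p.strip())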
Example #3
    def create_news_item(self, response):
        date = response.css('span.td-post-date time::attr(datetime)').get()
        news_date = self.get_datetime(date)
        threshold_date = datetime.datetime(2018, 1, 1)

        # checking if the news is up to date.
        if news_date > threshold_date:
            url = response.url
            news_date = self.format_date(news_date)
            title = response.css('header h1::text').get()
            author = response.css('div.td-post-author-name a::text').get()
            category = response.meta.get('category')
            text = response.css('div.td-post-content span.s1::text').getall()

            # trying to get the news text from a different tag.
            if not text:
                text = response.css('div.td-post-content p::text').getall()
                donation_paragraph = response.css(
                    'p.donation_paragraph::text').get()

                if text and text[-1] == donation_paragraph:
                    # remove donation paragraph.
                    del text[-1]

            # NOTE: we don't have the sub_title information in the news.
            return RiLab01Item(title=title,
                               author=author,
                               date=news_date,
                               text=text,
                               url=url,
                               section=category)
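get_datetime and format_date are also unshown. A minimal sketch, assuming the <time datetime="..."> attribute is ISO-formatted (e.g. '2019-03-25T10:30:00-03:00') and that the module was imported as `import datetime`, matching the datetime.datetime(...) call above:

    def get_datetime(self, date):
        # Hypothetical: keep only the ISO date part of the attribute.
        return datetime.datetime.strptime(date[:10], '%Y-%m-%d')

    def format_date(self, news_date):
        # Hypothetical: normalize to the dd/mm/YYYY form.
        return news_date.strftime('%d/%m/%Y')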
Example #4
    def parse_per_page(self, response):
        subtitle = response.css("div.c-overhead ::text").extract_first()
        title = response.css("h1.c-title ::text").extract_first()
        date = response.css(
            "div.c-credits li:nth-child(3) ::text").extract_first()
        author = response.css(
            "div.c-credits li:nth-child(1) ::text").extract_first()
        section = response.css("li.c-title-content a ::text").extract_first()

        url = response.url
        # Some pages keep the bracketed date in the second credits entry.
        if date is None or date[0] != "[":
            date = response.css(
                "div.c-credits li:nth-child(2) ::text").extract_first()

        # The entry looks like "[dd/mm/yyyy ...]"; parse only the date part.
        date = datetime.strptime(date[1:11], '%d/%m/%Y')

        all_p = response.css("div.paywall-google p ::text")
        text = "".join(p.extract() for p in all_p)
        text = text.replace(',', '')  # avoid commas breaking the CSV output
        item = RiLab01Item(title=title,
                           author=author,
                           url=url,
                           sub_title=subtitle.capitalize(),
                           date=date,
                           section=section,
                           text=text)
        yield item
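The slice date[1:11] assumes the credits entry starts with a bracketed date, e.g. '[25/03/2019 10h30]' (the exact page format is an assumption). A quick check of that parsing step:

    from datetime import datetime

    date = '[25/03/2019 10h30]'          # assumed shape of the credits entry
    parsed = datetime.strptime(date[1:11], '%d/%m/%Y')
    print(parsed)                        # 2019-03-25 00:00:00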
Example #5
    def parse_news_page(self, response):
        """Parse the news page and extract text, title, subtitle, author,
        section(the section is part of url), date and url data using loader object,
        check more in documentation(https://docs.scrapy.org/en/latest/topics/loaders.html)

        :param response: the html response from scrapy download
        :return scrapy.Item: a iterable list of items extracted
        """
        loader = RiLab01Loader(item=RiLab01Item(), response=response)
        url = response.url

        loader.add_value('_id', response.meta.get('page_count'))
        loader.add_css('title', '.c-titulo::text')
        loader.add_css('title', '.c-title::text')
        loader.add_css('sub_title', '.c-sobretitulo span::text')
        loader.add_css('sub_title', '.c-overhead span::text')
        loader.add_css('author', '[class*="autor"] span::text')
        loader.add_css('author', '.item-agency::text')
        loader.add_css('author', '.item-name span::text')
        loader.add_css('date', '.data-publicacao time::text')
        loader.add_value('section', url.split('/')[3])
        loader.add_css('text', '.paywall-google > p::text')
        loader.add_value('url', url)
        write_in_frontier(loader)

        return loader.load_item()
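RiLab01Loader and write_in_frontier are defined elsewhere. A minimal loader sketch, assuming TakeFirst output processing so that the stacked add_css calls above act as selector fallbacks; the processors are assumptions:

    from itemloaders.processors import Join, MapCompose, TakeFirst
    from scrapy.loader import ItemLoader

    class RiLab01Loader(ItemLoader):
        # TakeFirst keeps the first non-empty value, so repeated add_css
        # calls for the same field behave as fallbacks.
        default_input_processor = MapCompose(str.strip)
        default_output_processor = TakeFirst()
        # The body arrives as a list of paragraphs; join it into one string.
        text_out = Join(' ')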
Example #6
    def parse_detalhe_materia(self, response):
        item = RiLab01Item()

        author_raw = response.css('section p strong::text, strong a::text').get()
        item['author'] = self.formata_autor(author_raw)
        item['title'] = response.css('h1::text').get()
        item['sub_title'] = response.xpath('//p[(((count(preceding-sibling::*) + 1) = 4) and parent::*)]/text()').get()
        item['date'] = self.formata_data(
            response.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "meta", " " ))]/text()').get())
        item['section'] = response.url.split('/')[5]
        item['text'] = self.formata_texto(
            response.css('.entry p::text, p span::text, p a::text, '
                         '.entry span::text, strong::text').getall(),
            author_raw)
        item['url'] = response.url

        yield item
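The formata_* helpers ('format author/date/text') are not included. A rough sketch consistent with how they are called; all three bodies are assumptions:

    def formata_autor(self, author):
        # Hypothetical: strip the leading dash and surrounding whitespace.
        return author.replace('-', '').strip() if author else 'N/A'

    def formata_data(self, meta):
        # Hypothetical: the .meta element holds the publication date string.
        return meta.strip() if meta else 'N/A'

    def formata_texto(self, fragments, author):
        # Hypothetical: drop the author fragment and join the rest.
        author = (author or '').strip()
        return ' '.join(f.strip() for f in fragments
                        if f.strip() and f.strip() != author)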
Example #7
    def extract_data(self, response):
        output = RiLab01Item()
        self.id += 1
        self.log('ID: %s' % self.id)

        output['_id'] = str(self.id)
        output['title'] = self.get_title(response)
        output['sub_title'] = self.get_desc(response)
        output['author'] = self.get_author(response)
        output['date'] = self.get_date(response)
        output['section'] = self.get_section(response)
        output['text'] = self.get_text(response)
        output['url'] = response.url

        yield output
Example #8
    def _br_247_callback(self, response):
        output = RiLab01Item()

        output['_id'] = self.current_id
        output['title'] = response.css('h1::text').get(default='').replace('\n', '')
        output['sub_title'] = response.xpath(
            '//p[(((count(preceding-sibling::*) + 1) = 4) and parent::*)]/text()'
        ).get(default='').replace('\n', '')
        output['author'] = self._get_author(
            response.css('section p strong::text, strong a::text').get())
        output['date'] = response.xpath(
            '//*[contains(concat( " ", @class, " " ), concat( " ", "meta", " " ))]/text()'
        ).get()
        output['section'] = response.url.split('/')[5]
        # getall() returns a list, so clean each text fragment individually.
        output['text'] = [fragment.replace('\n', '') for fragment in response.css(
            '.entry p::text, p span::text, p a::text, .entry span::text, strong::text'
        ).getall()]
        output['url'] = response.url
        self.current_id += 1

        yield output
Example #9
    def parseNoticia(self, response):
        if self.permitsCrawl(response.url):
            # Links already present inside the news page:
            links = response.xpath('//a/@href').getall()
            titulo_noticia = response.xpath('//h1/text()').get()
            with open('frontier/diariodocentrodomundo.json', 'r') as frontier:
                # Read the frontier to generate the news id -> the order in
                # which it was added to the frontier.
                frontier_data = json.load(frontier)
            noticia_loader = ItemLoader(item=RiLab01Item(), response=response)
            noticia_loader.add_value('_id', len(frontier_data) + 1)
            noticia_loader.add_xpath('title', '//h1/text()')
            noticia_loader.add_value(
                'sub_title',
                'News articles have no subtitle on Diario do Centro do Mundo')
            noticia_loader.add_xpath(
                'author', '//div[@class="td-post-author-name"]/a/text()')
            date = response.xpath('//time/@datetime').get()
            noticia_loader.add_value('date', date)
            noticia_loader.add_value('section', 'Not specified on page!')
            noticia_loader.add_xpath(
                'text',
                '//div[@class="td-post-content td-pb-padding-side"]/p/text()')
            noticia_loader.add_value('url', response.url)
            item = noticia_loader.load_item()

            yield item

            with open('frontier/diariodocentrodomundo.json', 'r') as frontier:
                frontier_data = json.load(frontier)
            # Add the news link to the frontier.
            frontier_data[titulo_noticia] = response.url
            with open('frontier/diariodocentrodomundo.json', 'w') as frontier:
                json.dump(frontier_data, frontier)

            for link in links:
                if self.permitsCrawl(link):
                    # Recursively request each permitted link so its page
                    # is parsed as well.
                    yield scrapy.Request(link, self.parseNoticia)
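permitsCrawl gates both the current page and every outgoing link, but its body is not shown. A plausible predicate, assuming the spider should stay on the Diario do Centro do Mundo site; the domain string is an assumption taken from the frontier file name:

    def permitsCrawl(self, url):
        # Hypothetical: only follow absolute links on the target domain.
        return bool(url) and 'diariodocentrodomundo' in url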
Example #10
    def parse_article_detail(self, response):
        """
        Crawls article and get informations from it

        :param response: HTML code of article page
        :return: Item to include in CSV
        """
        item = RiLab01Item()

        item['title'] = response.css('h1::text').get()

        item['sub_title'] = response.xpath(
            '//p[(((count(preceding-sibling::*) + 1) = 4) and parent::*)]/text()'
        ).get()

        formatted_author = self.format_author(
            response.css('section p strong::text, strong a::text').get())
        item['author'] = formatted_author

        formatted_date = self.format_date(
            response.xpath(
                '//*[contains(concat( " ", @class, " " ), concat( " ", "meta", " " ))]/text()'
            ).get())
        item['date'] = formatted_date

        item['section'] = response.url.split('/')[5]

        formatted_text = self.format_text(
            response.css(
                '.entry p::text, p span::text, p a::text, .entry span::text, strong::text'
            ).getall())
        item['text'] = formatted_text

        item['url'] = response.url

        yield item
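The docstring mentions a CSV; in Scrapy that is usually done with a feed export rather than hand-written file code. A minimal settings sketch (the output file name is arbitrary):

    # settings.py (Scrapy >= 2.1 FEEDS syntax)
    FEEDS = {
        'news.csv': {
            'format': 'csv',
            'fields': ['title', 'sub_title', 'author', 'date',
                       'section', 'text', 'url'],
        },
    }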