예제 #1
0
    def parse(self, response):
        """Scrape the article listing and follow pagination.

        Yields one request per article to parse_linkpage (the partially
        filled item travels in request.meta['item']), then a request for
        the next listing page when a "next" link exists.
        """
        for article in response.xpath(
                "//div[@id='listaItems']/div[@class='todas-noticias grid-8']/div[@class='artigos2']/article"
        ):
            item = NewsBotItem()
            item['title'] = article.xpath(
                ".//h2/a/text()").extract_first().strip()
            item['category'] = article.xpath(
                ".//div[@class='categoria']/span/text()").extract_first(
                ).strip()
            item['link'] = article.xpath(
                ".//h2/a/@href").extract_first().strip()
            item['headline'] = article.xpath(
                ".//p/text()").extract_first().strip()
            item['date'] = article.xpath(
                ".//div[@class='categoria']/span[@class='data']/text()"
            ).extract_first().strip()

            request = scrapy.Request(item['link'],
                                     callback=self.parse_linkpage)
            request.meta['item'] = item
            yield request

        # BUG FIX: the original called .strip() before the None check, so the
        # guard was dead code and a page without a "next" link raised
        # AttributeError instead of ending the crawl cleanly.
        next_page = response.xpath(
            '//li[@class="next"]/a/@href').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page.strip())
            yield scrapy.Request(next_page, callback=self.parse)
예제 #2
0
 def parse(self, response):
     """Scrape articles from the #content-core listing and yield items.

     BUG FIX: the container XPath was missing its closing bracket
     ("//div[@id='content-core'"), which makes lxml raise an invalid
     expression error before any article is extracted.
     """
     for article in response.xpath("//div[@id='content-core']"):
         item = NewsBotItem()
         item['title'] = article.xpath(".//div/h2/a/text()").extract_first()
         item['link'] = article.xpath(".//div/h2/a/@href").extract_first()
         item['headline'] = article.xpath(
             ".//div/p/span/text()").extract_first().strip()
         yield item
예제 #3
0
    def parse(self, response):
        """Walk the articles list and request each article's detail page.

        The item is handed to parse_linkpage through request.meta['item'].
        """
        # Field name -> XPath relative to each <li> entry.
        field_paths = {
            'title': ".//div/span/a/text()",
            'category': ".//div/div[1]/a/text()",
            'link': ".//div/span/a/@href",
            'date': ".//div/div[2]/span/text()",
        }
        for entry in response.xpath("//ul[@class='articles-list']/li"):
            item = NewsBotItem()
            for field, path in field_paths.items():
                item[field] = entry.xpath(path).extract_first()

            request = scrapy.Request(item['link'],
                                     callback=self.parse_linkpage)
            request.meta['item'] = item
            yield request
예제 #4
0
    def parse(self, response):
        """Scan the highlighted-content blocks and request each article page."""
        container = ('//div[@class="destaques_templates"]'
                     '/div[@class="obj_contato_texto"]')
        for block in response.xpath(container):
            anchor = block.xpath('./div[2]/a')
            item = NewsBotItem()
            item['title'] = anchor.xpath('text()').extract_first().strip()
            # Hrefs on this site are relative; prefix the portal host.
            item['link'] = "http://www.stj.jus.br" + anchor.xpath(
                '@href').extract_first().strip()

            request = scrapy.Request(item['link'],
                                     callback=self.parse_linkpage)
            request.meta['item'] = item
            yield request
예제 #5
0
    def parse(self, response):
        """Scrape the featured story plus the listed articles, then paginate.

        Each article yields a request to parse_linkpage with the item in
        request.meta['item'].
        """
        # Featured story at the top of the page.
        item = NewsBotItem()
        item['title'] = response.xpath('//h3/a/text()').extract_first().strip()
        item['headline'] = response.xpath("//h4/text()").extract_first().strip()
        item['link'] = response.xpath(".//h3/a/@href").extract_first().strip()
        request = scrapy.Request(item['link'], callback=self.parse_linkpage)
        request.meta['item'] = item
        yield request

        # Remaining stories live in fixed-width table cells.
        for article in response.xpath('//*[@width="445px"]'):
            item = NewsBotItem()
            item['title'] = article.xpath('.//span[2]/a/text()').extract_first().strip()
            item['date'] = article.xpath('.//span/text()').extract_first().strip()
            item['link'] = "http://www.stf.jus.br/portal/cms/" + article.xpath(".//span[2]/a[@class='noticia']/@href").extract_first().strip()
            request = scrapy.Request(item['link'], callback=self.parse_linkpage)
            request.meta['item'] = item
            yield request

        # BUG FIX: .strip() ran before the None check, so the guard was dead
        # code and a missing "next" link raised AttributeError.
        next_page = response.xpath('//*[@style="text-align:right; border-bottom:1px solid #DFE8ED; width:71%;"]/a/@href').extract_first()
        if next_page is not None:
            next_page = "http://www.stf.jus.br/portal/cms/" + next_page.strip()
            next_page = response.urljoin(next_page)
            # NOTE(review): other spiders in this project paginate back into
            # self.parse; confirm self.parse_page exists on this class.
            yield scrapy.Request(next_page, callback=self.parse_page)
예제 #6
0
    def parse(self, response):
        """Scrape tiled news items and follow the "proximo" pagination link."""
        for article in response.xpath("//div[@class='tileItem visualIEFloatFix tile-collective-nitf-content']"):
            item = NewsBotItem()
            item['title'] = article.xpath(".//h2/a/text()").extract_first()
            item['headline'] = article.xpath(".//p/span/text()").extract_first().strip()
            item['category'] = article.xpath(".//span[@class='subtitle']/text()").extract_first()
            item['link'] = article.xpath(".//h2/a/@href").extract_first().strip()
            # The date is split across two sibling spans; concatenate the two
            # extracted lists (item['date'] is a list here, not a string).
            item['date'] = article.xpath(".//span/span[3]/text()[2]").extract()+article.xpath(".//span/span[4]/text()[2]").extract()

            request = scrapy.Request(item['link'], callback=self.parse_linkpage)
            request.meta['item'] = item
            yield request

        # BUG FIX: .strip() was called before the None check, so the guard was
        # dead code and a missing "proximo" link raised AttributeError.
        next_page = response.xpath('//a[@class="proximo"]/@href').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page.strip())
            yield scrapy.Request(next_page, callback=self.parse)
예제 #7
0
파일: mpf.py 프로젝트: vitorarins/news-bot
    def parse(self, response):
        """Scrape the MPF article listing and follow the "next" page link.

        BUG FIX: the container XPath started with three slashes
        ("///div[@id='listaItems']..."), which is not a valid XPath
        expression and makes lxml raise an evaluation error; a descendant
        search takes exactly two slashes.
        """
        for article in response.xpath(
                "//div[@id='listaItems']/div[@class='todas-noticias grid-8']/div[@class='artigos2']/article"
        ):
            item = NewsBotItem()
            item['title'] = article.xpath(
                ".//h2/a/text()").extract_first().strip()
            item['link'] = article.xpath(".//h2/a/@href").extract_first()
            item['headline'] = article.xpath(
                ".//p/text()").extract_first().strip()
            item['category'] = article.xpath(
                ".//div[@class='categoria']/span[1]/text()").extract_first()
            yield item

        next_page = response.xpath(
            "//div[@id='paginacao-mpf']/div[@class='footer-resultado']/ol/li[@class='next']/a/@href"
        ).extract_first()
        if next_page:
            self.logger.debug("Went to next page")
            yield scrapy.Request(response.urljoin(next_page),
                                 callback=self.parse)
예제 #8
0
    def parse(self, response):
        """Scrape the unstyled story list and follow the "next" page link."""
        for article in response.xpath("//ol[@class='unstyled']/li"):
            item = NewsBotItem()
            item['title'] = article.xpath(
                ".//a/text()").extract_first().strip()
            item['link'] = article.xpath(".//a/@href").extract_first().strip()
            item['date'] = article.xpath(
                ".//a/span/text()").extract_first().strip()

            request = scrapy.Request(item['link'],
                                     callback=self.parse_linkpage)
            request.meta['item'] = item
            yield request

        # BUG FIX: .strip() ran before the None check, making the guard dead
        # code; a page without a "next" link raised AttributeError instead of
        # ending the crawl.
        next_page = response.xpath(
            '//li[@class="next"]/a/@href').extract_first()
        if next_page is not None:
            next_page = "http://www1.folha.uol.com.br/ultimas-noticias/" + next_page.strip()
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
예제 #9
0
    def parse(self, response):
        """Scrape Hacker News listing rows into row dicts, yielding items.

        Each story on the HN front page spans multiple <tr> rows, so the
        fields of one story are accumulated across loop iterations in
        ``row`` and flushed once every field has a value.

        NOTE(review): this is Python 2 code (``print`` statement,
        str/unicode ``.encode(...).replace(...)`` chains); it will not run
        unmodified under Python 3.
        """
        # All non-spacer rows of the listing table, as raw HTML strings;
        # each is re-parsed below with Selector(text=...).
        description = response.xpath(
            "//table[@class='itemlist']/tr[not(re:test(@class, "
            "'(spacer)'))]").extract()
        row = self.get_default_row_dict()
        # print description
        for i, v in enumerate(description):
            index = i
            # Each field is only filled once per story; "if not row[...]"
            # skips fields already captured from an earlier row.
            if not row['rank']:
                value = Selector(text=v).xpath(
                    '//td[1]/span[@class="rank"]/text()').extract_first()
                # Rank text looks like "12."; drop the dot before int().
                row['rank'] = int(value.replace('.', '')) if value else 0

            if not row['story_text']:
                value = Selector(text=v).xpath(
                    '//td[3]/a[@class="storylink"]/text()').extract_first()
                row['story_text'] = value.encode("utf8") if value else ''

            if not row['link_href']:
                value = Selector(text=v).xpath(
                    '//td[3]/a[@class="storylink"]/@href').extract_first()
                # print value
                row['link_href'] = value if value else ''

            if not row['hn_user']:
                value = Selector(text=v).xpath(
                    '//a[@class="hnuser"]/text()').extract_first()
                row['hn_user'] = value.encode("utf8") if value else ''

            if not row['age']:
                value = Selector(text=v).xpath(
                    '//span[@class="age"]/a/text()').extract_first()
                # Age text looks like "3 hours ago"; keep the leading number.
                row['age'] = int(value.split(' ')[0]) if value else 0

            if not row['total_comments']:
                value = Selector(text=v).xpath(
                    '//td[@class="subtext"]/a[contains(@href, "item?id=")]/text()'
                ).extract_first()
                if value:
                    # Strip the "comments"/"comment" suffix; ascii-ignore
                    # drops the non-breaking space HN puts in the label.
                    value = value.encode('ascii', 'ignore').replace(
                        'comments', '') if value else ''
                    value = value.encode('ascii', 'ignore').replace(
                        'comment', '') if value else ''
                    row['total_comments'] = int(value) if represents_int(
                        value) else 0

            if not row['score']:
                value = Selector(text=v).xpath(
                    '//span[@class="score"]/text()').extract_first()
                row['score'] = int(value.split(' ')[0]) if value else 0

            if not row['hn_id_code']:
                value = Selector(
                    text=v).xpath('//tr[@class="athing"]/@id').extract_first()
                row['hn_id_code'] = int(value) if represents_int(value) else 0

            # The list comprehension is empty exactly when no value is still
            # None, and all([]) is True — i.e. "flush once the row is full".
            # NOTE(review): the comprehension shadows the loop's i and v.
            if all([None for i, v in row.items() if v == None]):
                print 'Go for save >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>'
                data = row.copy()
                row = self.get_default_row_dict()
                # NOTE(review): this appends a hard-coded item URL rather
                # than the scraped one — looks like leftover debug code.
                self.comment_url.append(
                    'https://news.ycombinator.com/item?id=15318440')
                news_id = data['hn_id_code']
                item = NewsBotItem(data)
                yield item
                # Follow the story's comment page, carrying the item along.
                request = scrapy.Request(
                    url='https://news.ycombinator.com/item?id=' + str(news_id),
                    callback=self.parse_comment)
                request.meta['item'] = item
                request.meta['news_id'] = int(news_id)
                yield request

            # Stories span two rows (title row + subtext row); reset the
            # accumulator after every odd-indexed row.
            if index % 2:
                row = self.get_default_row_dict()
예제 #10
0
    def parse(self, response):
        """Scrape every AFP news tab and request each article's page.

        The page repeats identical markup under four tab containers
        (afp_tab_content_<id>); the original duplicated the extraction
        code once per tab, so this walks the tab ids instead. The tab's
        own anchor text supplies the category for all of its articles.
        Yield order (tab 57, 2519, 58, 2520) matches the original.
        """
        for tab_id in ("57", "2519", "58", "2520"):
            # Category label comes from the tab header; it is the same for
            # every article under the tab, so extract it once.
            category = response.xpath(
                '//*[@id="afp_tab_%s"]/a/text()' % tab_id
            ).extract_first().strip()
            for article in response.xpath(
                    '//*[@id="afp_tab_content_%s"]/div' % tab_id):
                item = NewsBotItem()
                item['title'] = article.xpath(
                    ".//div/h4/a/text()").extract_first().strip()
                item['category'] = category
                # Hrefs are site-relative; prefix the AFP host.
                item['link'] = "https://www.afp.com" + article.xpath(
                    ".//div/h4/a/@href").extract_first().strip()
                # (Original had a redundant double .strip() here.)
                item['headline'] = article.xpath(
                    ".//div/div/p/text()").extract_first().strip()
                item['date'] = article.xpath(
                    ".//div/span/text()").extract_first().strip()

                request = scrapy.Request(item['link'],
                                         callback=self.parse_linkpage)
                request.meta['item'] = item
                yield request
예제 #11
0
    def parse(self, response):
        """Scrape the Reuters Brasil front page sections.

        Yields one parse_linkpage request per story: the top story, the
        secondary story, and one story from each of the five category
        sections (div[6]..div[10], identical markup), in that order.

        BUG FIX: in the first section the "?sp=true" suffix was appended
        to the category text instead of the article link; every other
        section appends it to the link. It is now on the link in all
        sections.
        """
        base = "http://br.reuters.com"
        # Page-level category header, shared by the first two stories.
        page_category = response.xpath(
            '//*[@id="maincontent"]/div[2]/h1/text()').extract_first().strip()

        # Top story (h4 markup under div[1]).
        top = "//*[@id='maincontent']/div[2]/div[2]/div[1]/div[1]/div/div"
        item = NewsBotItem()
        item['title'] = response.xpath(
            top + "/h4/a/text()").extract_first().strip()
        item['category'] = page_category
        item['link'] = base + response.xpath(
            top + "/h4/a/@href").extract_first().strip() + "?sp=true"
        item['headline'] = response.xpath(
            top + "/p/text()").extract_first().strip()
        item['date'] = response.xpath(
            top + "/h4/span/text()").extract_first().strip()
        request = scrapy.Request(item['link'], callback=self.parse_linkpage)
        request.meta['item'] = item
        yield request

        # Secondary story (h5 markup under div[3]).
        sec = '//*[@id="maincontent"]/div[2]/div[2]/div[1]/div[3]/div/div/div'
        item = NewsBotItem()
        item['title'] = response.xpath(
            sec + '/h5/a/text()').extract_first().strip()
        item['category'] = page_category
        item['link'] = base + response.xpath(
            sec + '/h5/a/@href').extract_first().strip() + "?sp=true"
        item['headline'] = response.xpath(
            sec + '/p/text()').extract_first().strip()
        item['date'] = response.xpath(
            sec + '/h5/span/text()').extract_first().strip()
        request = scrapy.Request(item['link'], callback=self.parse_linkpage)
        request.meta['item'] = item
        yield request

        # Category sections div[6]..div[10]: identical markup, one story
        # each, with the section's own h3 header as the category.
        for n in range(6, 11):
            sect = ('//*[@id="maincontent"]/div[2]/div[2]/div[1]/div[%d]'
                    '/div[1]/div' % n)
            body = sect + '/div[2]/div[1]/div/div/div/div'
            item = NewsBotItem()
            item['title'] = response.xpath(
                body + '/h5/a/text()').extract_first().strip()
            item['category'] = response.xpath(
                sect + '/div[1]/h3/a/text()').extract_first().strip()
            item['link'] = base + response.xpath(
                body + '/h5/a/@href').extract_first().strip() + "?sp=true"
            item['headline'] = response.xpath(
                body + '/p/text()').extract_first().strip()
            item['date'] = response.xpath(
                body + '/h5/span/text()').extract_first().strip()
            request = scrapy.Request(item['link'],
                                     callback=self.parse_linkpage)
            request.meta['item'] = item
            yield request