def parse(self, response):
    """Scrape the news listing: yield one follow-up request per article,
    then follow the "next" pagination link.

    The partially filled NewsBotItem travels to parse_linkpage via
    request.meta['item'].
    """
    for article in response.xpath(
            "//div[@id='listaItems']/div[@class='todas-noticias grid-8']"
            "/div[@class='artigos2']/article"):
        item = NewsBotItem()
        item['title'] = article.xpath(
            ".//h2/a/text()").extract_first().strip()
        item['category'] = article.xpath(
            ".//div[@class='categoria']/span/text()").extract_first().strip()
        item['link'] = article.xpath(
            ".//h2/a/@href").extract_first().strip()
        item['headline'] = article.xpath(
            ".//p/text()").extract_first().strip()
        item['date'] = article.xpath(
            ".//div[@class='categoria']/span[@class='data']/text()"
        ).extract_first().strip()
        request = scrapy.Request(item['link'], callback=self.parse_linkpage)
        request.meta['item'] = item
        yield request

    # BUG FIX: extract_first() returns None on the last page; the original
    # called .strip() before the None check, so pagination ended with an
    # AttributeError instead of stopping cleanly.
    next_page = response.xpath('//li[@class="next"]/a/@href').extract_first()
    if next_page is not None:
        next_page = response.urljoin(next_page.strip())
        yield scrapy.Request(next_page, callback=self.parse)
def parse(self, response):
    """Yield a NewsBotItem for each article block under #content-core."""
    # BUG FIX: the original selector was "//div[@id='content-core'" —
    # missing the closing "]" — which makes lxml raise an XPath
    # evaluation error before any item is produced.
    for article in response.xpath("//div[@id='content-core']"):
        item = NewsBotItem()
        item['title'] = article.xpath(".//div/h2/a/text()").extract_first()
        item['link'] = article.xpath(".//div/h2/a/@href").extract_first()
        item['headline'] = article.xpath(
            ".//div/p/span/text()").extract_first().strip()
        yield item
def parse(self, response):
    """Walk the articles list and hand every entry to parse_linkpage."""
    entries = response.xpath("//ul[@class='articles-list']/li")
    for entry in entries:
        title_text = entry.xpath(".//div/span/a/text()").extract_first()
        category_text = entry.xpath(".//div/div[1]/a/text()").extract_first()
        link_href = entry.xpath(".//div/span/a/@href").extract_first()
        date_text = entry.xpath(".//div/div[2]/span/text()").extract_first()

        item = NewsBotItem()
        item['title'] = title_text
        item['category'] = category_text
        item['link'] = link_href
        item['date'] = date_text

        follow = scrapy.Request(item['link'], callback=self.parse_linkpage)
        follow.meta['item'] = item
        yield follow
def parse(self, response):
    """Collect STJ highlight links and dispatch each to parse_linkpage."""
    highlights = response.xpath(
        '//div[@class="destaques_templates"]/div[@class="obj_contato_texto"]')
    for entry in highlights:
        anchor_text = entry.xpath('./div[2]/a/text()').extract_first()
        anchor_href = entry.xpath('./div[2]/a/@href').extract_first()

        item = NewsBotItem()
        item['title'] = anchor_text.strip()
        # Hrefs on this page are site-relative; prefix the host.
        item['link'] = "http://www.stj.jus.br" + anchor_href.strip()

        follow = scrapy.Request(item['link'], callback=self.parse_linkpage)
        follow.meta['item'] = item
        yield follow
def parse(self, response):
    """Scrape the STF featured story plus the listed stories, then paginate."""
    # Featured headline at the top of the page.
    item = NewsBotItem()
    item['title'] = response.xpath('//h3/a/text()').extract_first().strip()
    item['headline'] = response.xpath("//h4/text()").extract_first().strip()
    item['link'] = response.xpath(".//h3/a/@href").extract_first().strip()
    request = scrapy.Request(item['link'], callback=self.parse_linkpage)
    request.meta['item'] = item
    yield request

    # Remaining stories are laid out in fixed-width (445px) cells.
    for article in response.xpath('//*[@width="445px"]'):
        item = NewsBotItem()
        item['title'] = article.xpath(
            './/span[2]/a/text()').extract_first().strip()
        item['date'] = article.xpath('.//span/text()').extract_first().strip()
        item['link'] = "http://www.stf.jus.br/portal/cms/" + article.xpath(
            ".//span[2]/a[@class='noticia']/@href").extract_first().strip()
        request = scrapy.Request(item['link'], callback=self.parse_linkpage)
        request.meta['item'] = item
        yield request

    # BUG FIX: extract_first() returns None when there is no further page;
    # the original stripped before the None check and crashed instead of
    # ending pagination.
    next_page = response.xpath(
        '//*[@style="text-align:right; border-bottom:1px solid #DFE8ED; width:71%;"]'
        '/a/@href').extract_first()
    if next_page is not None:
        next_page = "http://www.stf.jus.br/portal/cms/" + next_page.strip()
        next_page = response.urljoin(next_page)
        # NOTE(review): callback is parse_page here, unlike the self.parse
        # used by the sibling spiders — confirm parse_page exists.
        yield scrapy.Request(next_page, callback=self.parse_page)
def parse(self, response):
    """Scrape Plone tile-style article listings and paginate via "proximo"."""
    for article in response.xpath(
            "//div[@class='tileItem visualIEFloatFix tile-collective-nitf-content']"):
        item = NewsBotItem()
        item['title'] = article.xpath(".//h2/a/text()").extract_first()
        item['headline'] = article.xpath(
            ".//p/span/text()").extract_first().strip()
        item['category'] = article.xpath(
            ".//span[@class='subtitle']/text()").extract_first()
        item['link'] = article.xpath(".//h2/a/@href").extract_first().strip()
        # The date is split across two spans; keep both extracted fragments
        # (a list concatenation, as in the original markup-dependent hack).
        item['date'] = (article.xpath(".//span/span[3]/text()[2]").extract()
                        + article.xpath(".//span/span[4]/text()[2]").extract())
        request = scrapy.Request(item['link'], callback=self.parse_linkpage)
        request.meta['item'] = item
        yield request

    # BUG FIX: extract_first() returns None on the last page; the original
    # called .strip() before the None check, raising AttributeError instead
    # of ending the crawl.
    next_page = response.xpath('//a[@class="proximo"]/@href').extract_first()
    if next_page is not None:
        next_page = response.urljoin(next_page.strip())
        yield scrapy.Request(next_page, callback=self.parse)
def parse(self, response):
    """Yield items for every article in the MPF listing, then paginate."""
    # BUG FIX: the original selector started with "///" — an empty
    # abbreviated step, which is invalid XPath and fails to evaluate.
    for article in response.xpath(
            "//div[@id='listaItems']/div[@class='todas-noticias grid-8']"
            "/div[@class='artigos2']/article"):
        item = NewsBotItem()
        item['title'] = article.xpath(
            ".//h2/a/text()").extract_first().strip()
        item['link'] = article.xpath(".//h2/a/@href").extract_first()
        item['headline'] = article.xpath(
            ".//p/text()").extract_first().strip()
        item['category'] = article.xpath(
            ".//div[@class='categoria']/span[1]/text()").extract_first()
        yield item

    next_page = response.xpath(
        "//div[@id='paginacao-mpf']/div[@class='footer-resultado']"
        "/ol/li[@class='next']/a/@href").extract_first()
    if next_page:
        self.logger.debug("Went to next page")
        yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
def parse(self, response):
    """Scrape Folha's latest-news list and follow the "next" link."""
    for article in response.xpath("//ol[@class='unstyled']/li"):
        item = NewsBotItem()
        item['title'] = article.xpath(".//a/text()").extract_first().strip()
        item['link'] = article.xpath(".//a/@href").extract_first().strip()
        item['date'] = article.xpath(
            ".//a/span/text()").extract_first().strip()
        request = scrapy.Request(item['link'], callback=self.parse_linkpage)
        request.meta['item'] = item
        yield request

    # BUG FIX: extract_first() returns None on the last page; the original
    # called .strip() before the None check, raising AttributeError instead
    # of stopping pagination.
    next_page = response.xpath('//li[@class="next"]/a/@href').extract_first()
    if next_page is not None:
        next_page = ("http://www1.folha.uol.com.br/ultimas-noticias/"
                     + next_page.strip())
        next_page = response.urljoin(next_page)
        yield scrapy.Request(next_page, callback=self.parse)
def parse(self, response):
    """Scrape the Hacker News front-page item table.

    Each story spans several <tr> rows (spacer rows are filtered out by
    the selector), so fields are accumulated into `row` across iterations
    and the row is emitted once every field is populated. Python 2 code
    (`print` statement, str/bytes `.encode`).
    """
    description = response.xpath(
        "//table[@class='itemlist']/tr[not(re:test(@class, "
        "'(spacer)'))]").extract()
    # Fresh accumulator; fields start empty/None and are filled as the
    # matching <tr> fragments appear.
    row = self.get_default_row_dict()
    # print description
    for i, v in enumerate(description):
        index = i
        if not row['rank']:
            value = Selector(text=v).xpath(
                '//td[1]/span[@class="rank"]/text()').extract_first()
            # Rank text looks like "12." — drop the trailing dot.
            row['rank'] = int(value.replace('.', '')) if value else 0
        if not row['story_text']:
            value = Selector(text=v).xpath(
                '//td[3]/a[@class="storylink"]/text()').extract_first()
            row['story_text'] = value.encode("utf8") if value else ''
        if not row['link_href']:
            value = Selector(text=v).xpath(
                '//td[3]/a[@class="storylink"]/@href').extract_first()
            # print value
            row['link_href'] = value if value else ''
        if not row['hn_user']:
            value = Selector(text=v).xpath(
                '//a[@class="hnuser"]/text()').extract_first()
            row['hn_user'] = value.encode("utf8") if value else ''
        if not row['age']:
            value = Selector(text=v).xpath(
                '//span[@class="age"]/a/text()').extract_first()
            # Age text like "3 hours ago" — keep only the leading number.
            row['age'] = int(value.split(' ')[0]) if value else 0
        if not row['total_comments']:
            value = Selector(text=v).xpath(
                '//td[@class="subtext"]/a[contains(@href, "item?id=")]/text()'
            ).extract_first()
            if value:
                # Strip "comments"/"comment" suffix; "discuss" links leave a
                # non-numeric string, handled by represents_int below.
                value = value.encode('ascii', 'ignore').replace(
                    'comments', '') if value else ''
                value = value.encode('ascii', 'ignore').replace(
                    'comment', '') if value else ''
                row['total_comments'] = int(value) if represents_int(
                    value) else 0
        if not row['score']:
            value = Selector(text=v).xpath(
                '//span[@class="score"]/text()').extract_first()
            # Score text like "120 points".
            row['score'] = int(value.split(' ')[0]) if value else 0
        if not row['hn_id_code']:
            value = Selector(
                text=v).xpath('//tr[@class="athing"]/@id').extract_first()
            row['hn_id_code'] = int(value) if represents_int(value) else 0
        # Emit once no field is None: the comprehension collects a None per
        # still-unset field, so all([]) (i.e. every field set) is True.
        if all([None for i, v in row.items() if v == None]):
            print 'Go for save >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>'
            data = row.copy()
            row = self.get_default_row_dict()
            # NOTE(review): hard-coded item id appended every save —
            # looks like leftover debug code; confirm intent.
            self.comment_url.append(
                'https://news.ycombinator.com/item?id=15318440')
            news_id = data['hn_id_code']
            item = NewsBotItem(data)
            yield item
            # Follow the comment page for this story; parse_comment reads
            # the item and numeric id back from request.meta.
            request = scrapy.Request(
                url='https://news.ycombinator.com/item?id=' + str(news_id),
                callback=self.parse_comment)
            request.meta['item'] = item
            request.meta['news_id'] = int(news_id)
            yield request
        # Every story occupies two non-spacer rows; reset the accumulator
        # after the second row of each pair. (NOTE(review): placement at
        # loop level reconstructed from collapsed source — confirm.)
        if index % 2:
            row = self.get_default_row_dict()
def parse(self, response):
    """Scrape the four AFP front-page tab panes and follow every article.

    Each pane //*[@id="afp_tab_content_<id>"] lists article cards, and the
    matching tab header //*[@id="afp_tab_<id>"]/a carries the pane's
    category label. The original repeated the identical loop four times
    with only the tab id changing (and a redundant double .strip() on the
    headline); this iterates over the ids instead.
    """
    for tab_id in ("57", "2519", "58", "2520"):
        articles = response.xpath(
            '//*[@id="afp_tab_content_%s"]/div' % tab_id)
        if not articles:
            # An empty pane produced no iterations in the original loops.
            continue
        # The category is shared by the whole pane; extract it once.
        category = response.xpath(
            '//*[@id="afp_tab_%s"]/a/text()' % tab_id).extract_first().strip()
        for article in articles:
            item = NewsBotItem()
            item['title'] = article.xpath(
                ".//div/h4/a/text()").extract_first().strip()
            item['category'] = category
            item['link'] = "https://www.afp.com" + article.xpath(
                ".//div/h4/a/@href").extract_first().strip()
            item['headline'] = article.xpath(
                ".//div/div/p/text()").extract_first().strip()
            item['date'] = article.xpath(
                ".//div/span/text()").extract_first().strip()
            request = scrapy.Request(item['link'],
                                     callback=self.parse_linkpage)
            request.meta['item'] = item
            yield request
def parse(self, response):
    """Scrape the featured Reuters Brazil stories and follow each link.

    The page has a top story (h4 heading), a secondary story (h5), and
    five section blocks (div[6]..div[10], each with its own h3 category
    header). The original hand-unrolled seven near-identical stanzas;
    this builds each request from its XPaths via one helper.
    """
    main = "//*[@id='maincontent']/div[2]"

    def _story_request(title_xp, cat_xp, link_xp, head_xp, date_xp):
        # Fill a NewsBotItem from the given XPaths and wrap it in a
        # follow-up request for parse_linkpage.
        item = NewsBotItem()
        item['title'] = response.xpath(title_xp).extract_first().strip()
        item['category'] = response.xpath(cat_xp).extract_first().strip()
        # "?sp=true" selects the single-page article view.
        item['link'] = ("http://br.reuters.com"
                        + response.xpath(link_xp).extract_first().strip()
                        + "?sp=true")
        item['headline'] = response.xpath(head_xp).extract_first().strip()
        item['date'] = response.xpath(date_xp).extract_first().strip()
        request = scrapy.Request(item['link'], callback=self.parse_linkpage)
        request.meta['item'] = item
        return request

    # Top story (h4 heading).
    # BUG FIX: the original appended "?sp=true" to the *category* string
    # here and left it off the link — a copy-paste slip; every other
    # stanza appends it to the link. Fixed to match.
    top = main + "/div[2]/div[1]/div[1]/div/div"
    yield _story_request(
        top + "/h4/a/text()",
        main + "/h1/text()",
        top + "/h4/a/@href",
        top + "/p/text()",
        top + "/h4/span/text()",
    )

    # Secondary story (h5 heading), category from the same page h1.
    second = main + "/div[2]/div[1]/div[3]/div/div/div"
    yield _story_request(
        second + "/h5/a/text()",
        main + "/h1/text()",
        second + "/h5/a/@href",
        second + "/p/text()",
        second + "/h5/span/text()",
    )

    # Section blocks div[6]..div[10]; each has its own h3 category link.
    for i in range(6, 11):
        section = main + "/div[2]/div[1]/div[%d]/div[1]/div" % i
        story = section + "/div[2]/div[1]/div/div/div/div"
        yield _story_request(
            story + "/h5/a/text()",
            section + "/div[1]/h3/a/text()",
            story + "/h5/a/@href",
            story + "/p/text()",
            story + "/h5/span/text()",
        )