def parse_homepage(self, response):
    article_urls_raw = response.xpath(
        "//a[@class = 'js-hlp-LinkSwap js-tsr-Base_ContentLink tsr-Base_ContentLink']//@href"
    ).extract()
    # Keep only absolute https links; everything else is not an article page.
    article_urls = [url for url in article_urls_raw if url.startswith("https")]
    # article_titles = response.xpath("//a[@class = 'js-hlp-LinkSwap js-tsr-Base_ContentLink tsr-Base_ContentLink']//@title").extract()

    for ii, article_url in enumerate(article_urls):
        # In principle only one kind of item needs to be generated, because
        # the pipeline treats them all identically.
        article = newsItem()
        article['index'] = ii
        article['url'] = article_url
        # Derive the rubric from the URL path (cf. parse_rubrics).
        if 'politik' in article_url.split('/'):
            article['rubrics'] = 'politics'
        elif 'wirtschaft' in article_url.split('/'):
            article['rubrics'] = 'economics'
        else:
            article['rubrics'] = 'homepage'
        article_request = scrapy.Request(article_url, callback=self.parse_article)
        # The callback only receives the response, not the item, so the item
        # is stored in the request's meta and transferred to the next parser.
        article_request.meta['item'] = article
        yield article_request
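# A minimal sketch of the downstream callback, assuming item fields 'title'
# and 'text' and generic XPaths -- the real parse_article will differ per
# site. It shows how the item stored in request.meta above is recovered
# from response.meta and completed before being handed to the pipeline.
def parse_article(self, response):
    article = response.meta['item']  # item handed over by parse_homepage
    # Hypothetical selectors; adjust to the target site's markup.
    article['title'] = response.xpath("//h1//text()").get()
    article['text'] = " ".join(response.xpath("//p//text()").extract())
    yield article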
def parse_homepage(self, response):
    article_urls_raw = response.xpath(
        "//article[contains(@class, 'articulo')]//@href"
    ).extract()
    # Keep only links that stay on elpais.com; anything else leads off-site.
    article_urls = [
        url for url in article_urls_raw
        if url.startswith("https://elpais.com")
    ]

    for ii, article_url in enumerate(article_urls):
        # In principle only one kind of item needs to be generated, because
        # the pipeline treats them all identically.
        article = newsItem()
        article['index'] = ii
        article['url'] = article_url
        # Derive the rubric from the URL path (cf. parse_rubrics).
        if 'politica' in article_url.split('/'):
            article['rubrics'] = 'politics'
        elif 'economia' in article_url.split('/'):
            article['rubrics'] = 'economics'
        else:
            article['rubrics'] = 'homepage'
        article_request = scrapy.Request(article_url, callback=self.parse_article)
        # The callback only receives the response, not the item, so the item
        # is stored in the request's meta and transferred to the next parser.
        article_request.meta['item'] = article
        yield article_request
def parse_homepage(self, response):
    article_urls_raw = response.xpath(
        "//a[contains(@class, 'css-m47150 esdb6og4')]//@href"
    ).extract()
    # Links that already start with http usually point to external sites;
    # internal links are relative, so prepend the domain.
    article_urls = [
        'https://www.lefigaro.fr' + url
        for url in article_urls_raw
        if not url.startswith("http")
    ]

    for ii, article_url in enumerate(article_urls):
        # In principle only one kind of item needs to be generated, because
        # the pipeline treats them all identically.
        article = newsItem()
        article['index'] = ii
        article['url'] = article_url
        # Derive the rubric from the URL path (cf. parse_rubrics).
        if 'politique' in article_url.split('/'):
            article['rubrics'] = 'politics'
        elif 'economie' in article_url.split('/'):
            article['rubrics'] = 'economics'
        else:
            article['rubrics'] = 'homepage'
        article_request = scrapy.Request(article_url, callback=self.parse_article)
        # The callback only receives the response, not the item, so the item
        # is stored in the request's meta and transferred to the next parser.
        article_request.meta['item'] = article
        yield article_request
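# The hard-coded domain prefix above can also be built with Scrapy's
# response.urljoin, which resolves relative hrefs against the page URL and
# avoids keeping the domain string in sync by hand. A sketch:
#
#   article_urls = [
#       response.urljoin(url)
#       for url in article_urls_raw
#       if not url.startswith("http")
#   ]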
def parse_homepage(self, response):
    # Most Bild-internal article URLs end with .html.
    article_urls_raw = response.xpath(
        "//a[contains(@href, '.html')]//@href"
    ).extract()
    # Bild.de uses relative paths for its internal pages, so prepend the domain.
    article_urls = [
        'https://www.bild.de' + url
        for url in article_urls_raw
        if not url.startswith("http")
    ]

    for ii, article_url in enumerate(article_urls):
        # In principle only one kind of item needs to be generated, because
        # the pipeline treats them all identically.
        article = newsItem()
        article['index'] = ii
        article['url'] = article_url
        article_request = scrapy.Request(article_url, callback=self.parse_article)
        # The callback only receives the response, not the item, so the item
        # is stored in the request's meta and transferred to the next parser.
        article_request.meta['item'] = article
        yield article_request
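# Alternative hand-over, a sketch assuming Scrapy >= 1.7: cb_kwargs passes
# the item directly as a callback keyword argument, which keeps
# response.meta free for middleware data (proxies, retry counters, etc.).
# The method names below are hypothetical, chosen to avoid clashing with
# the existing parse_article.
def parse_homepage_via_kwargs(self, response, article_urls):
    for ii, article_url in enumerate(article_urls):
        article = newsItem()
        article['index'] = ii
        article['url'] = article_url
        yield scrapy.Request(article_url,
                             callback=self.parse_article_via_kwargs,
                             cb_kwargs={'item': article})

def parse_article_via_kwargs(self, response, item):
    # The item arrives as a keyword argument instead of via response.meta.
    item['title'] = response.xpath("//h1//text()").get()
    yield item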