callback=self.extract_genre,  # per-element extraction hook for this task
            post_extract=self.save_as,  # persist the extracted rows once the pass finishes
            post_extract_kwargs={
                'file_format': 'json',
                # NOTE(review): 'extractor_name' appears to be interpolated into the
                # save-as output template by the framework -- confirm against save_as.
                'template_values': {
                    'extractor_name': 'genres'
                },
            },
        )

    def extract_movie(self, element, idx, **kwargs):
        """Extract one movie row.

        Args:
            element: parsed selector node for a single movie entry.
            idx: position of the element within the page (unused here).
            **kwargs: extra context forwarded by the framework (unused).

        Returns:
            dict with the movie 'title' text pulled from the row's link.
        """
        return {
            'title': element.css('span[title] a').xpath('string()').extract_first()
        }

    def extract_genre(self, element, idx, **kwargs):
        """Extract one genre row: the text of the genre link element."""
        return {'name': element.css('a').xpath('string()').extract_first()}


# Wire the custom dispatch/extract stages into a runnable scraper instance.
my_scraper = Scraper(dispatch_cls=MyDispatch, extract_cls=MyExtract)


if __name__ == '__main__':
    import logging
    # NOTE(review): the log format uses %(scraper_name)s, so the framework
    # presumably injects 'scraper_name' into every LogRecord -- confirm,
    # otherwise logging will raise a formatting error.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(name)s - [%(scraper_name)s] %(message)s'
    )
    run_cli(my_scraper)
class MyExtract(Extract):
    """Extraction stage: pull the product list out of the second response."""

    def extract(self, raw_source, source_idx):
        """Yield an extract task for the products of the second request.

        Args:
            raw_source: raw response body for the request at ``source_idx``.
            source_idx: zero-based index of the originating request.

        Yields:
            A single extract task covering the decoded product list; nothing
            is yielded for any response other than the second one.
        """
        if source_idx != 1:
            # Only the second request carries the data we want to extract.
            return
        products = json.loads(raw_source)['products']
        yield self.extract_task(
            name='products',
            raw_source=products,
            callback=self.extract_product,
            post_extract=self.save_as,
            post_extract_kwargs={'file_format': 'json'},
        )

    def extract_product(self, item, idx, **kwargs):
        """Map one decoded product record to the row that gets saved."""
        # Returning a dict (instead of None) keeps the row in the saved output.
        title = item.get('description', '')
        return {'title': title}


# Wire the custom dispatch/download/extract stages into a scraper instance.
my_scraper = Scraper(
    dispatch_cls=MyDispatch,
    download_cls=MyDownload,
    extract_cls=MyExtract,
)


if __name__ == '__main__':
    import logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(name)s - [%(scraper_name)s] %(message)s'
    )
    run_cli(my_scraper)