Example #1
            callback=self.extract_genre,
            post_extract=self.save_as,
            post_extract_kwargs={
                'file_format': 'json',
                'template_values': {
                    'extractor_name': 'genres'
                },
            },
        )

    def extract_movie(self, element, idx, **kwargs):
        # Pull the movie title text out of the linked title element
        return {
            'title': element.css('span[title] a').xpath('string()').extract_first()
        }

    def extract_genre(self, element, idx, **kwargs):
        # The genre name is the text of the link inside the matched element
        return {'name': element.css('a').xpath('string()').extract_first()}


my_scraper = Scraper(dispatch_cls=MyDispatch, extract_cls=MyExtract)

if __name__ == '__main__':
    import logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(name)s - [%(scraper_name)s] %(message)s'
    )
    run_cli(my_scraper)
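
The extract callbacks above chain .css(...) with .xpath('string()') and .extract_first(), which is the parsel/Scrapy selector API: the CSS step narrows to the target element, string() concatenates its descendant text nodes, and extract_first() returns the first match or None. Below is a minimal standalone sketch with the parsel library, assuming the elements handed to the callbacks behave like parsel selectors; the HTML fragment is illustrative, not the real page markup.

from parsel import Selector

# Illustrative HTML fragment, not taken from the scraped site
html = ('<td class="titleColumn">'
        '<span title="example"><a href="/title/tt0111161/">The Shawshank Redemption</a></span>'
        '</td>')

sel = Selector(text=html)

# css() narrows to the <a> inside a <span title=...>, xpath('string()')
# concatenates the text nodes of that element, and extract_first()
# returns the first match (or None when nothing matched)
title = sel.css('span[title] a').xpath('string()').extract_first()
print(title)  # -> The Shawshank Redemption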
Example #2
class MyExtract(Extract):

    def extract(self, raw_source, source_idx):
        if source_idx == 1:
            # Only extract data from the second request (source_idx is zero-based)
            yield self.extract_task(
                name='products',
                raw_source=json.loads(raw_source)['products'],
                callback=self.extract_product,
                post_extract=self.save_as,
                post_extract_kwargs={'file_format': 'json'},
            )

    def extract_product(self, item, idx, **kwargs):
        # Return the extracted data so post_extract (save_as) still writes it out
        return {'title': item.get('description', '')}


my_scraper = Scraper(dispatch_cls=MyDispatch,
                     download_cls=MyDownload,
                     extract_cls=MyExtract)

if __name__ == '__main__':
    import logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(name)s - [%(scraper_name)s] %(message)s'
    )
    run_cli(my_scraper)
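
Both examples use %(scraper_name)s in the logging format string, which is not a standard LogRecord attribute; presumably the scraper library attaches it to its own log records. If you want the same format to work for your own log calls, the attribute can be supplied with the standard-library extra= argument or a logging.Filter. A minimal sketch using only the standard library, where the 'my_scraper' value is just a placeholder:

import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(name)s - [%(scraper_name)s] %(message)s'
)


class ScraperNameFilter(logging.Filter):
    """Attach a default scraper_name to records that do not already carry one."""

    def __init__(self, scraper_name):
        super().__init__()
        self.scraper_name = scraper_name

    def filter(self, record):
        if not hasattr(record, 'scraper_name'):
            record.scraper_name = self.scraper_name
        return True


logger = logging.getLogger(__name__)
logger.addFilter(ScraperNameFilter('my_scraper'))

# Either rely on the filter's default...
logger.info('dispatch started')
# ...or pass the attribute explicitly via extra=
logger.info('extract finished', extra={'scraper_name': 'my_scraper'})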