Exemplo n.º 1
0
class TopicScrapingTask:
    """
    Scrape a topic page and run the article scraping task on every article
    that the topic parser lists.
    """

    def __init__(self, topic_scraper, article_scraping_task):
        """
        Store the collaborators and create a logger named after this class.

        @param topic_scraper: object whose scrape(url) result is iterated by run().
        @type topic_scraper: Scraper
        @param article_scraping_task: object whose run(article) is called once
            per article found on the topic page.
        @type article_scraping_task: ArticleScrapingTask
        """
        self._topic_scraper = topic_scraper
        self._article_scraping_task = article_scraping_task
        self._logger = Logger(self.__class__.__name__)

    def run(self, topic_url):
        """
        Scrape the topic at topic_url and collect its successfully scraped
        articles.

        Only the first parser yielded by the topic scraper is processed: the
        method returns from inside the loop over scrape() results. If the
        scraper yields nothing, the loop body never runs and the method
        implicitly returns None.

        An IOError raised while handling one article is logged and that
        article is skipped; the remaining articles are still processed.

        @param topic_url: address of the topic page, passed to the topic scraper.
        @return: the articles for which the article scraping task returned a
            truthy value, in the order the parser produced them; None if the
            scraper yielded no parser.
        @rtype: list or None
        """
        # NOTE(review): returning after the first parser looks deliberate
        # (scrape() presumably yields at most one parser) -- confirm before
        # changing this to accumulate across several parsers.
        for parser in self._topic_scraper.scrape(topic_url):
            assert isinstance(parser, TopicParser)
            self._logger.info('Scraping topic at %s.' % topic_url)

            articles = []
            for article in parser.get_articles():
                try:
                    if self._article_scraping_task.run(article):
                        articles.append(article)
                    else:
                        self._logger.warn('Could not parse article body at %s', article.url)

                except IOError as e:
                    # "except X as e" parses on Python 2.6+ and Python 3; the
                    # older "except X, e" form is a syntax error on Python 3.
                    # A failure on one article must not abort the whole topic.
                    self._logger.error('Failed scraping article: %s' % e)

            return articles