예제 #1
0
class SourceConsumer(ConsumerThread):
    """Consumer thread that crawls a source URL and stores its articles.

    Each consumed item is a source URL.  The source is crawled for topics,
    every topic that is not considered fresh is scraped for articles, and
    each article is written to the storage backend.
    """

    def __init__(self, worker_id, task_queue, completed_queue):
        """Create the storage backend, the source crawler and the scraping task.

        All three arguments are forwarded unchanged to
        ``ConsumerThread.__init__``.
        """
        ConsumerThread.__init__(self, worker_id, task_queue, completed_queue)

        self._storage = DatabaseStorage()
        self._crawler = GenericSourceCrawler(Scraper([GenericSourceParser]), Scraper([RSSParser]))

        topic_scraper = Scraper([RSSParser, GenericFrontPageParser])
        # NOTE(review): the generic parser is listed last, presumably as a
        # fallback after the site-specific ones -- confirm against Scraper.
        article_scraper = Scraper([
            TheOnionParser,
            EngadgetParser,
            TechCrunchParser,
            TorontoStarParser,
            TheGlobeAndMailParser,
            GenericContentParser,
        ])

        sub_task = ArticleScrapingTask(article_scraper)
        self._task = TopicScrapingTask(topic_scraper, sub_task)

    def consume(self, source_url):
        """Crawl ``source_url`` and store the articles of every stale topic.

        A topic is skipped when storage reports a last update that is less
        than ``TopicFreshnessSeconds`` old.  An ``IOError`` raised while
        scraping one topic is logged and the loop moves on to the next topic.
        An ``IOError`` or ``ValueError`` raised anywhere else (including a
        ``ValueError`` from topic scraping) is logged and ends processing of
        this source.  Neither exception type propagates to the caller.
        """
        self._logger.info('Consuming source url %s.' % source_url)
        try:
            source = self._crawler.crawl(source_url)

            for topic in source.topics:
                last_scraped = self._storage.get_topic_last_update(topic.url)
                if last_scraped is not None:
                    age_seconds = (datetime.utcnow() - last_scraped).total_seconds()
                    if age_seconds < TopicFreshnessSeconds:
                        self._logger.info('Not scraping fresh topic: %s' % topic.url)
                        continue
                try:
                    for article in self._task.run(topic.url):
                        self._storage.insert(article, topic, source)
                except IOError as e:
                    # One broken topic must not stop the remaining topics.
                    self._logger.error('Failed scraping topic: %s' % e)

        except (IOError, ValueError) as e:
            # Both failure kinds were handled identically; one handler suffices.
            self._logger.error('Failed scraping source: %s' % e)
예제 #2
0
    def __init__(self, worker_id, task_queue, completed_queue):
        """Wire up storage, the source crawler and the topic scraping task.

        The three arguments are passed straight through to
        ``ConsumerThread.__init__``.
        """
        ConsumerThread.__init__(self, worker_id, task_queue, completed_queue)

        self._storage = DatabaseStorage()

        # Crawler built from two scrapers: one for generic source pages and
        # one for RSS feeds.
        source_page_scraper = Scraper([GenericSourceParser])
        rss_scraper = Scraper([RSSParser])
        self._crawler = GenericSourceCrawler(source_page_scraper, rss_scraper)

        topic_parsers = [RSSParser, GenericFrontPageParser]
        front_page_scraper = Scraper(topic_parsers)

        # Parser order is kept exactly as given; the generic content parser
        # comes last.
        article_parsers = [
            TheOnionParser,
            EngadgetParser,
            TechCrunchParser,
            TorontoStarParser,
            TheGlobeAndMailParser,
            GenericContentParser,
        ]
        content_scraper = Scraper(article_parsers)

        # Topic scraping delegates to the article scraping sub-task.
        article_task = ArticleScrapingTask(content_scraper)
        self._task = TopicScrapingTask(front_page_scraper, article_task)