Example #1
    def testGetCrawlJobs(self):
        """
        test get_crawl_jobs logic

        a query that returns a list of urls that should be crawled.

        the list is determined by the last time it was checked if at
        all and whether it is currently scheduled to be crawled

        the goal is to crawl a url at most 5 minutes
        """
        recs = self.make_data()

        # everything is scheduled; backdate the scheduled timestamps so
        # we have the right conditions
        self.scheduled_backdate_recs(recs, 10)

        # at this point we should get 0 results because no job has
        # been marked checked
        result = list(scheduler.get_crawl_jobs())
        self.assertEqual(len(result), 0)

        # if we mark one job checked and backdate it to 7 minutes ago,
        # we should get exactly one result
        self.checked_backdate_recs([recs[0]], 7)
        result = list(scheduler.get_crawl_jobs())
        self.assertEqual(len(result), 1)
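
The query under test is not shown in the listing above. The following is a minimal sketch of what get_crawl_jobs might look like, assuming SQLAlchemy and hypothetical scheduled/checked datetime columns on CrawlJobModel; the function signature, column names, and exact criteria are assumptions, chosen only to match the behaviour the test asserts (jobs that were never checked are not returned, while a job checked 7 minutes ago is).

    from datetime import datetime, timedelta

    # Hypothetical sketch only -- the real query lives in the scheduler
    # module.  Column names (scheduled, checked) and the signature are
    # assumptions, not taken from the source.
    def get_crawl_jobs(session, CrawlJobModel):
        cutoff = datetime.utcnow() - timedelta(minutes=5)
        return session.query(CrawlJobModel).filter(
            CrawlJobModel.scheduled.isnot(None),  # currently scheduled
            CrawlJobModel.checked.isnot(None),    # has been checked at least once
            CrawlJobModel.checked < cutoff,       # last check is over 5 minutes old
        )
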
Example #2
    def testSchedulerAndCrawler(self):
        urls = [u"http://feeds.feedburner.com/43folders",
                u"http://advocacy.python.org/podcasts/littlebit.rss",
                u"http://friendfeed.com/alawrence?format=atom",
                u"http://feeds.feedburner.com/antiwar"]

        with transaction.manager:
            for url in urls:
                meta.Session().add(CrawlJobModel(url=url))


        self.assertTrue(len(list(meta.Session().query(CrawlJobModel).all())))
        self.assertTrue(len(list(get_crawl_jobs())))

        log.info("telling worker to use database %s" % self.db_url)
        scheduler_bind = "ipc:///tmp/scheduler_socket"
        crawl_bind = "ipc:///tmp/crawler_socket"
        from feederengine import crawler
        with mock(crawler, "proxy", mock_rss_server):
            w = SchedulerWorker(self.db_url, scheduler_bind)

            c = CrawlWorker(scheduler_bind, crawl_bind)

            w.start()
            c.start()

            self.assertTrue(w.is_alive())
            self.assertTrue(c.is_alive())

            context = zmq.Context()

            with pull_socket(context, crawl_bind) as subscription:
                count = 0
                tries = 0
                poller = zmq.Poller()
                poller.register(subscription, zmq.POLLIN)
                while count < len(urls) and tries < 100:
                    polled = dict(poller.poll(timeout=100))
                    if subscription in polled and polled[subscription] == zmq.POLLIN:
                        try:
                            url, data = subscription.recv_multipart(zmq.NOBLOCK)
                            count += 1
                        except zmq.ZMQError:
                            log.error("timeout", exc_info=True)
                            time.sleep(.1)
                        else:
                            log.info(data)
                    tries += 1
                    log.info("tries %s and results %s" % (tries, count))

            w.terminate()
            c.terminate()
            time.sleep(1)
            self.assertFalse(w.is_alive())
            self.assertFalse(c.is_alive())
            self.assertEqual(count, len(urls), "didn't get all expected messages")
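
The pull_socket helper used in the test is not shown either. A minimal sketch, assuming it is just a context manager around a ZeroMQ PULL socket (whether the test side connects or binds to the crawler endpoint is an assumption):

    from contextlib import contextmanager

    import zmq

    # Hypothetical sketch of the pull_socket helper used in the test above.
    @contextmanager
    def pull_socket(context, endpoint):
        socket = context.socket(zmq.PULL)
        socket.connect(endpoint)  # assumption: the CrawlWorker binds, the test connects
        try:
            yield socket
        finally:
            socket.close()

With a helper like this, the test's poll loop can receive the (url, data) multipart messages the crawler pushes on crawl_bind, and the poller timeout keeps the loop from blocking forever if a worker dies.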