Exemplo n.º 1
0
def fetch(limit=100, retreive_all=False):
    """Crawl the feeds with the client crawler.

    Builds a ``CrawlerScheduler`` from the ``conf.CRAWLER_LOGIN`` and
    ``conf.CRAWLER_PASSWD`` settings, then starts a run and waits on it
    while the scheduler's ``pool`` context manager is active.

    :param limit: forwarded unchanged to ``CrawlerScheduler.run``.
    :param retreive_all: forwarded unchanged to ``CrawlerScheduler.run``
        (the spelling matches the keyword that method is called with).
    """
    # Imported at call time rather than at module import time.
    from crawler.http_crawler import CrawlerScheduler

    login = conf.CRAWLER_LOGIN
    password = conf.CRAWLER_PASSWD
    crawler = CrawlerScheduler(login, password)

    # Both calls happen inside the pool context, so the pool is only
    # exited once wait() has returned (or an exception propagated).
    with crawler.pool:
        crawler.run(
            limit=limit,
            retreive_all=retreive_all,
        )
        crawler.wait()
Exemplo n.º 2
0
    def test_no_add_on_304(self):
        """Check that a crawl answered with status 304 adds no article.

        The number of articles returned by the API must be the same (18)
        before and after a complete scheduler run.
        """
        # 304 is HTTP "Not Modified".
        # NOTE(review): presumably read by the fixture that fakes feed
        # responses -- confirm in the test case's setUp.
        self.resp_status_code = 304

        # Baseline article count before any crawl.
        resp = self._api('get', 'articles', data={'limit': 1000}, user='******')
        self.assertEqual(18, len(resp.json()))

        scheduler = CrawlerScheduler('admin', 'admin')
        scheduler.run()
        scheduler.wait()

        # The crawl must not have created anything.
        resp = self._api('get', 'articles', data={'limit': 1000}, user='******')
        self.assertEqual(18, len(resp.json()))
Exemplo n.º 3
0
    def test_http_crawler_add_articles(self):
        """Check that a crawl adds articles and that a 304 re-crawl adds none.

        The API is expected to report 18 articles before the first run and
        143 after it; a second run made with the response status set to 304
        must leave the count at 143.
        """
        scheduler = CrawlerScheduler('admin', 'admin')

        # Baseline article count before any crawl.
        resp = self._api('get', 'articles', data={'limit': 1000}, user='******')
        self.assertEqual(18, len(resp.json()))

        scheduler.run()
        scheduler.wait()
        resp = self._api('get', 'articles', data={'limit': 1000}, user='******')
        self.assertEqual(143, len(resp.json()))

        # Stored content must contain neither a ``srcset=`` attribute nor a
        # ``src`` starting with "/". assertNotIn reports the offending
        # content on failure, unlike assertFalse(x in y).
        for art in resp.json():
            self.assertNotIn('srcset=', art['content'])
            self.assertNotIn('src="/', art['content'])

        # 304 is HTTP "Not Modified": crawling again must not add anything.
        # NOTE(review): presumably read by the fixture that fakes feed
        # responses -- confirm in the test case's setUp.
        self.resp_status_code = 304
        scheduler.run()
        scheduler.wait()
        resp = self._api('get', 'articles', data={'limit': 1000}, user='******')
        self.assertEqual(143, len(resp.json()))
Exemplo n.º 4
0
def fetch(limit=100, retreive_all=False):
    """Crawl the feeds with the client crawler.

    A ``CrawlerScheduler`` is created with the credentials found in
    ``conf.CRAWLER_LOGIN`` / ``conf.CRAWLER_PASSWD``. With the scheduler's
    ``pool`` entered as a context manager, a run is started and then
    waited on.

    :param limit: passed through as ``limit`` to ``CrawlerScheduler.run``.
    :param retreive_all: passed through as ``retreive_all`` to
        ``CrawlerScheduler.run`` (spelling kept to match that keyword).
    """
    # Function-scope import: resolved only when fetch() is called.
    from crawler.http_crawler import CrawlerScheduler

    user_name = conf.CRAWLER_LOGIN
    secret = conf.CRAWLER_PASSWD
    feed_scheduler = CrawlerScheduler(user_name, secret)

    # run() and wait() are both issued while the pool context is open.
    with feed_scheduler.pool:
        feed_scheduler.run(
            limit=limit,
            retreive_all=retreive_all,
        )
        feed_scheduler.wait()
Exemplo n.º 5
0
    def test_no_add_on_304(self):
        """Check that a crawl answered with status 304 adds no article.

        The API must report 18 articles both before and after the
        scheduler has run.
        """
        scheduler = CrawlerScheduler('admin', 'admin')

        # 304 is HTTP "Not Modified".
        # NOTE(review): presumably read by the fixture that fakes feed
        # responses -- confirm in the test case's setUp.
        self.resp_status_code = 304

        # Baseline article count before the crawl.
        resp = self._api('get', 'articles', data={'limit': 1000}, user='******')
        self.assertEqual(18, len(resp.json()))

        scheduler.run()
        scheduler.wait()

        # Count is unchanged once the crawl has completed.
        resp = self._api('get', 'articles', data={'limit': 1000}, user='******')
        self.assertEqual(18, len(resp.json()))
Exemplo n.º 6
0
    def test_matching_etag(self):
        """Check how the etag in the response affects article creation.

        Three crawls are run in sequence:

        1. feeds reset with 'fake etag', response etag identical
           -> count stays at 18;
        2. feeds reset with 'jarr/fake etag', response etag identical
           -> count stays at 18;
        3. feeds reset with 'jarr/fake etag', response etag different
           -> count becomes 143.
        """
        # Phase 1: stored etag and response etag are the same string.
        self._reset_feeds_freshness(etag='fake etag')
        self.resp_headers = {'etag': 'fake etag'}
        resp = self._api('get', 'articles', data={'limit': 1000}, user='******')
        self.assertEqual(18, len(resp.json()))
        scheduler = CrawlerScheduler('admin', 'admin')
        scheduler.run()
        scheduler.wait()

        resp = self._api('get', 'articles', data={'limit': 1000}, user='******')
        self.assertEqual(18, len(resp.json()))

        # Phase 2: same check with a 'jarr/'-prefixed etag on both sides.
        self._reset_feeds_freshness(etag='jarr/fake etag')
        self.resp_headers = {'etag': 'jarr/fake etag'}

        scheduler.run()
        scheduler.wait()

        resp = self._api('get', 'articles', data={'limit': 1000}, user='******')
        self.assertEqual(18, len(resp.json()))

        # Phase 3: the response etag no longer equals the stored one, so
        # the crawl is expected to add articles.
        self._reset_feeds_freshness(etag='jarr/fake etag')
        self.resp_headers = {'etag': '########################'}

        scheduler.run()
        scheduler.wait()

        resp = self._api('get', 'articles', data={'limit': 1000}, user='******')
        self.assertEqual(143, len(resp.json()))
Exemplo n.º 7
0
    def test_matching_etag(self):
        """Check how the etag in the response affects article creation.

        With a response etag equal to the one the feeds were reset with
        ('fake etag', then 'jarr/fake etag') the article count must stay
        at 18; with a different response etag it must reach 143.
        """
        # Identical plain etag on both sides: nothing should be added.
        self._reset_feeds_freshness(etag='fake etag')
        self.resp_headers = {'etag': 'fake etag'}
        resp = self._api('get', 'articles', data={'limit': 1000}, user='******')
        self.assertEqual(18, len(resp.json()))
        scheduler = CrawlerScheduler('admin', 'admin')

        scheduler.run()
        scheduler.wait()
        resp = self._api('get', 'articles', data={'limit': 1000}, user='******')
        self.assertEqual(18, len(resp.json()))

        # Identical 'jarr/'-prefixed etag on both sides: still nothing added.
        self._reset_feeds_freshness(etag='jarr/fake etag')
        self.resp_headers = {'etag': 'jarr/fake etag'}

        scheduler.run()
        scheduler.wait()
        resp = self._api('get', 'articles', data={'limit': 1000}, user='******')
        self.assertEqual(18, len(resp.json()))

        # Response etag differs from the stored one: articles get created.
        self._reset_feeds_freshness(etag='jarr/fake etag')
        self.resp_headers = {'etag': '########################'}

        scheduler.run()
        scheduler.wait()
        resp = self._api('get', 'articles', data={'limit': 1000}, user='******')
        self.assertEqual(143, len(resp.json()))