Example #1
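# Task entry point: delegates to UniqueFeed.objects.update_feed and, when the
# job times out, bumps the back-off factor and reschedules the fetch.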
def update_feed(url,
                etag=None,
                modified=None,
                subscribers=1,
                request_timeout=10,
                backoff_factor=1,
                error=None,
                link=None,
                title=None,
                hub=None):
    from .models import UniqueFeed
    try:
        UniqueFeed.objects.update_feed(url,
                                       etag=etag,
                                       last_modified=modified,
                                       subscribers=subscribers,
                                       backoff_factor=backoff_factor,
                                       previous_error=error,
                                       link=link,
                                       title=title,
                                       hub=hub)
    except JobTimeoutException:
        backoff_factor = min(UniqueFeed.MAX_BACKOFF, backoff_factor + 1)
        logger.debug("Job timed out, backing off %s to %s" % (
            url,
            backoff_factor,
        ))
        schedule_job(url,
                     schedule_in=UniqueFeed.delay(backoff_factor),
                     backoff_factor=backoff_factor,
                     connection=get_redis_connection())
Example #2
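 # Bumps the back-off factor (capped at MAX_BACKOFF, logging when the cap is
 # reached) and reschedules the job with the error recorded.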
 def backoff_feed(self, url, error, backoff_factor):
     if backoff_factor == UniqueFeed.MAX_BACKOFF - 1:
         logger.info("reached max backoff factor", url=url, error=error)
     backoff_factor = min(UniqueFeed.MAX_BACKOFF, backoff_factor + 1)
     schedule_job(url, schedule_in=UniqueFeed.delay(backoff_factor),
                  error=error, backoff_factor=backoff_factor,
                  connection=get_redis_connection())
Example #3
    def test_schedule_limit_items_count(self):
        for i in range(100):
            schedule_job('foo{0}'.format(i), schedule_in=-1)

        jobs = list(pending_jobs(limit=10))
        self.assertEqual(len(jobs), 10)
        self.assertEqual(len(list(scheduled_jobs())), 90)
Example #4
    def test_scheduler_backup(self, get):
        get.return_value = responses(304)

        feed = FeedFactory.create()
        with self.assertNumQueries(1):
            call_command('backup_scheduler')

        schedule_job(feed.url, schedule_in=10, subscribers=10, etag='foobar',
                     backoff_factor=2, last_update=int(time.time()) + 10,
                     title="f" * 2049)

        with self.assertNumQueries(1):
            call_command('backup_scheduler')

        schedule_job(feed.url, schedule_in=10, title='12')

        with self.assertNumQueries(1):
            call_command('backup_scheduler')

        unique = UniqueFeed.objects.get()
        self.assertEqual(unique.subscribers, 10)
        self.assertEqual(unique.backoff_factor, 2)
        self.assertEqual(unique.etag, 'foobar')
        self.assertEqual(unique.modified, '')
        delta = (unique.last_update - timezone.now()).seconds
        self.assertTrue(5 < delta < 10)

        for i in range(4):
            FeedFactory.create()
        with self.assertNumQueries(5):
            call_command('backup_scheduler')
Example #5
    def test_errors(self, get):
        codes = [400, 401, 403, 404, 500, 502, 503]

        def get_side_effect():
            yield responses(304)
            for code in codes:
                yield responses(code)

        get.side_effect = get_side_effect()
        feed = FeedFactory.create()
        self.assertEqual(len(get.call_args_list), 1)

        for code in codes:
            get.return_value = responses(code)
            feed = UniqueFeed.objects.get(url=feed.url)
            self.assertFalse(feed.muted)
            self.assertEqual(feed.job_details.get('error'), None)
            self.assertEqual(feed.job_details['backoff_factor'], 1)
            feed.schedule()
            data = job_details(feed.url, connection=get_redis_connection())

            update_feed(feed.url, backoff_factor=data['backoff_factor'])

            feed = UniqueFeed.objects.get(url=feed.url)
            self.assertFalse(feed.muted)
            data = job_details(feed.url, connection=get_redis_connection())
            self.assertEqual(data['error'], code)
            self.assertEqual(data['backoff_factor'], 2)

            # Restore status for next iteration
            schedule_job(feed.url, backoff_factor=1, error=None, schedule_in=0)
            feed = UniqueFeed.objects.get(url=feed.url)
            self.assertEqual(feed.job_details.get('error'), None)
Example #6
 def test_job_data(self):
     schedule_job('fòo', schedule_in=-1, other_arg='lol')
     self.assertEqual(list(pending_jobs()), [{
         'id': 'fòo',
         'other_arg': 'lol'
     }])
     self.assertEqual(len(list(pending_jobs())), 0)
Example #7
def update_feed(
    url,
    etag=None,
    modified=None,
    subscribers=1,
    request_timeout=10,
    backoff_factor=1,
    error=None,
    link=None,
    title=None,
    hub=None,
):
    from .models import UniqueFeed

    try:
        UniqueFeed.objects.update_feed(
            url,
            etag=etag,
            last_modified=modified,
            subscribers=subscribers,
            backoff_factor=backoff_factor,
            previous_error=error,
            link=link,
            title=title,
            hub=hub,
        )
    except JobTimeoutException:
        backoff_factor = min(UniqueFeed.MAX_BACKOFF, backoff_factor + 1)
        logger.debug("Job timed out, backing off %s to %s" % (url, backoff_factor))
        schedule_job(
            url,
            schedule_in=UniqueFeed.delay(backoff_factor),
            backoff_factor=backoff_factor,
            connection=get_redis_connection(),
        )
Example #8
 def test_scheduled_jobs(self):
     schedule_job('jòb', schedule_in=10)
     schedule_job('ötherjòb', schedule_in=20)
     schedule = scheduled_jobs(with_times=True)
     self.assertEqual([s[0] for s in schedule], ['jòb', 'ötherjòb'])
     schedule = list(scheduled_jobs())
     self.assertEqual(schedule, ['jòb', 'ötherjòb'])
Example #9
    def test_schedule_limit_items_count(self):
        for i in range(100):
            schedule_job('foo{0}'.format(i), schedule_in=-1)

        jobs = list(pending_jobs(limit=10))
        self.assertEqual(len(jobs), 10)
        self.assertEqual(len(list(scheduled_jobs())), 90)
Example #10
    def test_manage_feed(self, get):
        get.return_value = responses(304)
        user = UserFactory.create()
        url = reverse('feeds:manage')
        response = self.app.get(url, user=user)
        self.assertContains(response, 'Manage feeds')

        FeedFactory.create(user=user, category=None)
        FeedFactory.create(user=user, category=None)
        FeedFactory.create(user=user, category=None)
        unique = UniqueFeed.objects.all()[0]
        schedule_job(unique.url,
                     schedule_in=0,
                     backoff_factor=10,
                     error=UniqueFeed.NOT_A_FEED,
                     connection=get_redis_connection())

        response = self.app.get(url, user=user)
        self.assertContains(response, 'Not a valid RSS/Atom feed')

        schedule_job(unique.url,
                     schedule_in=0,
                     error='blah',
                     connection=get_redis_connection())
        response = self.app.get(url, user=user)
        self.assertContains(response, 'Error')

        unique.muted = True
        unique.save()
        response = self.app.get(url, user=user)
        self.assertContains(response, 'Error')
Example #11
    def test_errors(self, get):
        codes = [400, 401, 403, 404, 500, 502, 503]

        def get_side_effect():
            yield responses(304)
            for code in codes:
                yield responses(code)
        get.side_effect = get_side_effect()
        feed = FeedFactory.create()
        self.assertEqual(len(get.call_args_list), 1)

        for code in codes:
            get.return_value = responses(code)
            feed = UniqueFeed.objects.get(url=feed.url)
            self.assertFalse(feed.muted)
            self.assertEqual(feed.job_details.get('error'), None)
            self.assertEqual(feed.job_details['backoff_factor'], 1)
            feed.schedule()
            data = job_details(feed.url, connection=get_redis_connection())

            update_feed(feed.url, backoff_factor=data['backoff_factor'])

            feed = UniqueFeed.objects.get(url=feed.url)
            self.assertFalse(feed.muted)
            data = job_details(feed.url, connection=get_redis_connection())
            self.assertEqual(data['error'], code)
            self.assertEqual(data['backoff_factor'], 2)

            # Restore status for next iteration
            schedule_job(feed.url, backoff_factor=1, error=None, schedule_in=0)
            feed = UniqueFeed.objects.get(url=feed.url)
            self.assertEqual(feed.job_details.get('error'), None)
Example #12
 def test_scheduled_jobs(self):
     schedule_job('jòb', schedule_in=10)
     schedule_job('ötherjòb', schedule_in=20)
     schedule = scheduled_jobs(with_times=True)
     self.assertEqual([s[0] for s in schedule], ['jòb', 'ötherjòb'])
     schedule = list(scheduled_jobs())
     self.assertEqual(schedule, ['jòb', 'ötherjòb'])
Example #13
    def test_manage_feed(self, get):
        get.return_value = responses(304)
        user = UserFactory.create()
        url = reverse('feeds:manage')
        response = self.app.get(url, user=user)
        self.assertContains(response, 'Manage feeds')

        FeedFactory.create(user=user, category=None)
        FeedFactory.create(user=user, category=None)
        FeedFactory.create(user=user, category=None)
        unique = UniqueFeed.objects.all()[0]
        schedule_job(unique.url, schedule_in=0, backoff_factor=10,
                     error=UniqueFeed.NOT_A_FEED,
                     connection=get_redis_connection())

        response = self.app.get(url, user=user)
        self.assertContains(response, 'Not a valid RSS/Atom feed')

        schedule_job(unique.url, schedule_in=0, error='blah',
                     connection=get_redis_connection())
        response = self.app.get(url, user=user)
        self.assertContains(response, 'Error')

        unique.muted = True
        unique.save()
        response = self.app.get(url, user=user)
        self.assertContains(response, 'Error')
Example #14
    def test_job_details(self):
        schedule_job('details', schedule_in=-1, stuff='baz', other=123)

        self.assertEqual(job_details('details'), {
            'id': 'details',
            'stuff': 'baz',
            'schedule_at': int(time.time()) - 1,
            'other': 123,
        })
Example #15
 def backoff_feed(self, url, error, backoff_factor):
     if backoff_factor == UniqueFeed.MAX_BACKOFF - 1:
         logger.debug(u"{0} reached max backoff period ({1})".format(
             url, error,
         ))
     backoff_factor = min(UniqueFeed.MAX_BACKOFF, backoff_factor + 1)
     schedule_job(url, schedule_in=UniqueFeed.delay(backoff_factor),
                  error=error, backoff_factor=backoff_factor,
                  connection=get_redis_connection())
Example #16
 def backoff_feed(self, url, error, backoff_factor):
     if backoff_factor == UniqueFeed.MAX_BACKOFF - 1:
         logger.debug(u"{0} reached max backoff period ({1})".format(
             url, error,
         ))
     backoff_factor = min(UniqueFeed.MAX_BACKOFF, backoff_factor + 1)
     schedule_job(url, schedule_in=UniqueFeed.delay(backoff_factor),
                  error=error, backoff_factor=backoff_factor,
                  connection=get_redis_connection())
Example #17
 def backoff_feed(self, url, error, backoff_factor):
     if backoff_factor == UniqueFeed.MAX_BACKOFF - 1:
         logger.info("reached max backoff factor", url=url, error=error)
     backoff_factor = min(UniqueFeed.MAX_BACKOFF, backoff_factor + 1)
     schedule_job(url,
                  schedule_in=UniqueFeed.delay(backoff_factor),
                  error=error,
                  backoff_factor=backoff_factor,
                  connection=get_redis_connection())
Example #18
    def test_job_details(self):
        schedule_job('details', schedule_in=-1, stuff='baz', other=123)

        self.assertEqual(
            job_details('details'), {
                'id': 'details',
                'stuff': 'baz',
                'schedule_at': int(time.time()) - 1,
                'other': 123,
            })
Example #19
    def test_remove_keys(self):
        schedule_job('foobar', schedule_in=-1, attr='stuff', other=12,
                     thing='blah blah')
        jobs = list(pending_jobs())
        self.assertEqual(jobs, [{'id': 'foobar', 'attr': 'stuff',
                                 'other': 12, 'thing': 'blah blah'}])

        schedule_job('foobar', schedule_in=-1, attr=None, other=None,
                     thing='blah blah')
        jobs = list(pending_jobs())
        self.assertEqual(jobs, [{'id': 'foobar', 'thing': 'blah blah'}])
Example #20
    def test_legacy_redis(self):
        connection = redis.Redis(**REDIS)
        for i in range(10):
            schedule_job('foo{0}'.format(i), schedule_in=-1,
                         connection=connection)

        jobs = list(pending_jobs(connection=connection, reschedule_in=-1))
        self.assertEqual(len(jobs), 10)

        jobs = list(pending_jobs(connection=r, reschedule_in=-1))
        self.assertEqual(len(jobs), 10)
Example #21
    def test_reschedule(self):
        schedule_job('baz', schedule_in=-1)
        schedule_job('foo', schedule_in=10)
        jobs = list(pending_jobs(reschedule_in=20))
        self.assertEqual(jobs, [{'id': 'baz'}])

        schedule = list(scheduled_jobs(with_times=True))
        foo = schedule[0]
        baz = schedule[1]
        self.assertEqual(foo[0], 'foo')
        self.assertEqual(baz[0], 'baz')
        self.assertEqual(foo[1] + 10, baz[1])
Example #22
    def test_legacy_redis(self):
        connection = redis.Redis(**REDIS)
        for i in range(10):
            schedule_job('foo{0}'.format(i),
                         schedule_in=-1,
                         connection=connection)

        jobs = list(pending_jobs(connection=connection, reschedule_in=-1))
        self.assertEqual(len(jobs), 10)

        jobs = list(pending_jobs(connection=r, reschedule_in=-1))
        self.assertEqual(len(jobs), 10)
Example #23
    def test_reschedule(self):
        schedule_job('baz', schedule_in=-1)
        schedule_job('foo', schedule_in=10)
        jobs = list(pending_jobs(reschedule_in=20))
        self.assertEqual(jobs, [{'id': 'baz'}])

        schedule = list(scheduled_jobs(with_times=True))
        foo = schedule[0]
        baz = schedule[1]
        self.assertEqual(foo[0], 'foo')
        self.assertEqual(baz[0], 'baz')
        self.assertEqual(foo[1] + 10, baz[1])
Example #24
    def test_uniquefeed_model(self, get):
        get.return_value = responses(304)
        FeedFactory.create(url='http://example.com/' + 'foo/' * 200)
        unique = UniqueFeed.objects.get()
        self.assertEqual(len(unique.truncated_url()), 50)

        unique.delete()

        FeedFactory.create(url='http://example.com/foo/')
        unique = UniqueFeed.objects.get()
        self.assertEqual(len(unique.truncated_url()), len(unique.url))

        unique = UniqueFeed(url='http://foo.com')
        self.assertEqual('%s' % unique, 'http://foo.com')

        self.assertIs(UniqueFeedManager.entry_data({}, None), None)

        unique.schedule()
        details = unique.job_details
        at = details.pop('schedule_at')
        details.pop('last_update')
        self.assertEqual(details, {
            u"backoff_factor": 1,
            u"subscribers": 1,
            u"id": "http://foo.com",
        })
        details['schedule_at'] = at
        self.assertEqual(unique.job_details['id'], "http://foo.com")

        self.assertTrue(unique.scheduler_data.startswith("{\n"))

        self.assertTrue(unique.next_update > timezone.now())
        self.assertTrue(
            unique.next_update < timezone.now() + timedelta(seconds=60 * 61))

        schedule_job(unique.url, title='Lol', schedule_in=0)
        del unique._job_details
        details = unique.job_details
        details.pop('schedule_at')
        details.pop('last_update')
        self.assertEqual(
            details, {
                u"title": u"Lol",
                u"backoff_factor": 1,
                u"subscribers": 1,
                u"id": "http://foo.com",
            })
Example #25
    def test_uniquefeed_model(self, get):
        get.return_value = responses(304)
        FeedFactory.create(url='http://example.com/' + 'foo/' * 200)
        unique = UniqueFeed.objects.get()
        self.assertEqual(len(unique.truncated_url()), 50)

        unique.delete()

        FeedFactory.create(url='http://example.com/foo/')
        unique = UniqueFeed.objects.get()
        self.assertEqual(len(unique.truncated_url()), len(unique.url))

        unique = UniqueFeed(url='http://foo.com')
        self.assertEqual('%s' % unique, 'http://foo.com')

        self.assertIs(UniqueFeedManager.entry_data({}, None), None)

        unique.schedule()
        details = unique.job_details
        at = details.pop('schedule_at')
        details.pop('last_update')
        self.assertEqual(details, {
            u"backoff_factor": 1,
            u"subscribers": 1,
            u"id": "http://foo.com",
        })
        details['schedule_at'] = at
        self.assertEqual(unique.job_details['id'], "http://foo.com")

        self.assertTrue(unique.scheduler_data.startswith("{\n"))

        self.assertTrue(unique.next_update > timezone.now())
        self.assertTrue(unique.next_update <
                        timezone.now() + timedelta(seconds=60 * 61))

        schedule_job(unique.url, title='Lol', schedule_in=0)
        del unique._job_details
        details = unique.job_details
        details.pop('schedule_at')
        details.pop('last_update')
        self.assertEqual(details, {
            u"title": u"Lol",
            u"backoff_factor": 1,
            u"subscribers": 1,
            u"id": "http://foo.com",
        })
Example #26
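 # (Re)schedules this feed's job: applies the given kwargs over the defaults,
 # carries over existing job attributes when no delay is given, and falls back
 # to a back-off-based delay if the job is missing.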
 def schedule(self, schedule_in=None, **job):
     if hasattr(self, '_job_details'):
         del self._job_details
     connection = get_redis_connection()
     kwargs = {
         'subscribers': 1,
         'backoff_factor': 1,
         'last_update': int(time.time()),
     }
     kwargs.update(job)
     if schedule_in is None:
         try:
             for attr in self.JOB_ATTRS:
                 if attr in self.job_details:
                     kwargs[attr] = self.job_details[attr]
             schedule_in = self.schedule_in
         except JobNotFound:
             schedule_in = self.delay(kwargs['backoff_factor'])
     schedule_job(self.url, schedule_in=schedule_in,
                  connection=connection, **kwargs)
Example #27
def update_feed(url, etag=None, modified=None, subscribers=1,
                request_timeout=10, backoff_factor=1, error=None, link=None,
                title=None, hub=None):
    from .models import UniqueFeed
    try:
        UniqueFeed.objects.update_feed(
            url, etag=etag, last_modified=modified, subscribers=subscribers,
            backoff_factor=backoff_factor, previous_error=error, link=link,
            title=title, hub=hub)
    except JobTimeoutException:
        backoff_factor = min(UniqueFeed.MAX_BACKOFF,
                             backoff_factor + 1)
        logger.info("job timed out, backing off",
                    url=url, backoff_factor=backoff_factor)
        schedule_job(url, schedule_in=UniqueFeed.delay(backoff_factor),
                     backoff_factor=backoff_factor,
                     connection=get_redis_connection())
    except BaseException as e:
        logger.info("fatal job exception", url=url, exc_info=e)
        raise
Example #28
 def schedule(self, schedule_in=None, **job):
     if hasattr(self, '_job_details'):
         del self._job_details
     connection = get_redis_connection()
     kwargs = {
         'subscribers': 1,
         'backoff_factor': 1,
         'last_update': int(time.time()),
     }
     kwargs.update(job)
     if schedule_in is None:
         try:
             for attr in self.JOB_ATTRS:
                 if attr in self.job_details:
                     kwargs[attr] = self.job_details[attr]
             schedule_in = self.schedule_in
         except JobNotFound:
             schedule_in = self.delay(kwargs['backoff_factor'])
     schedule_job(self.url, schedule_in=schedule_in,
                  connection=connection, **kwargs)
Example #29
    def test_remove_keys(self):
        schedule_job('foobar',
                     schedule_in=-1,
                     attr='stuff',
                     other=12,
                     thing='blah blah')
        jobs = list(pending_jobs())
        self.assertEqual(jobs, [{
            'id': 'foobar',
            'attr': 'stuff',
            'other': 12,
            'thing': 'blah blah'
        }])

        schedule_job('foobar',
                     schedule_in=-1,
                     attr=None,
                     other=None,
                     thing='blah blah')
        jobs = list(pending_jobs())
        self.assertEqual(jobs, [{'id': 'foobar', 'thing': 'blah blah'}])
Example #30
 def test_schedule_with_id(self):
     with self.assertRaises(RuntimeError):
         schedule_job('testing', schedule_in=1, id=12)
Example #31
 def test_ordering(self):
     schedule_job('foo', schedule_in=-1)
     schedule_job('bar', schedule_in=-2)
     jobs = list(pending_jobs())
     self.assertEqual(jobs[0]['id'], 'bar')
     self.assertEqual(jobs[1]['id'], 'foo')
Example #32
 def test_schedule_without_delay(self):
     with self.assertRaises(TypeError):
         schedule_job('trololol')
Example #33
 def test_ordering(self):
     schedule_job('foo', schedule_in=-1)
     schedule_job('bar', schedule_in=-2)
     jobs = list(pending_jobs())
     self.assertEqual(jobs[0]['id'], 'bar')
     self.assertEqual(jobs[1]['id'], 'foo')
Example #34
 def test_schedule_with_timedelta(self):
     schedule_job('delta', schedule_in=timedelta(seconds=-1))
Example #35
 def test_schedule_non_unicode_data(self):
     schedule_job('bad', schedule_in=-1,
                  etag=b'2013/6/29 \xa4W\xa4\xc8 09:51:31')
     job = list(pending_jobs())[0]
     self.assertEqual(job['etag'], b'2013/6/29 \xa4W\xa4\xc8 09:51:31')
Example #36
 def test_schedule_non_unicode_data(self):
     schedule_job('bad',
                  schedule_in=-1,
                  etag=b'2013/6/29 \xa4W\xa4\xc8 09:51:31')
     job = list(pending_jobs())[0]
     self.assertEqual(job['etag'], b'2013/6/29 \xa4W\xa4\xc8 09:51:31')
Example #37
 def test_reschedule_existing(self):
     schedule_job('lol', schedule_in=-1)
     schedule_job('lol', schedule_in=10)
     self.assertEqual(len(list(pending_jobs())), 0)
     schedule_job('lol', schedule_in=-1)
     self.assertEqual(len(list(pending_jobs())), 1)
Example #38
 def test_reschedule_existing(self):
     schedule_job('lol', schedule_in=-1)
     schedule_job('lol', schedule_in=10)
     self.assertEqual(len(list(pending_jobs())), 0)
     schedule_job('lol', schedule_in=-1)
     self.assertEqual(len(list(pending_jobs())), 1)
Example #39
 def test_schedule_without_delay(self):
     with self.assertRaises(TypeError):
         schedule_job('trololol')
Example #40
 def test_schedule_in_future(self):
     schedule_job('lol', schedule_in=10)
     self.assertEqual(len(list(pending_jobs())), 0)
     delete_job('lol')
Example #41
 def test_job_deletion(self):
     schedule_job('bar', schedule_in=-1)
     delete_job('bar')
     self.assertEqual(len(list(pending_jobs())), 0)
Example #42
 def test_job_data(self):
     schedule_job('fòo', schedule_in=-1, other_arg='lol')
     self.assertEqual(list(pending_jobs()), [{'id': 'fòo',
                                              'other_arg': 'lol'}])
     self.assertEqual(len(list(pending_jobs())), 0)
Example #43
 def test_schedule_in_future(self):
     schedule_job('lol', schedule_in=10)
     self.assertEqual(len(list(pending_jobs())), 0)
     delete_job('lol')
Example #44
    def test_custom_connection(self):
        for i in range(10):
            schedule_job('foo{0}'.format(i), schedule_in=-1, connection=r)

        jobs = list(pending_jobs(connection=r))
        self.assertEqual(len(jobs), 10)
Example #45
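    # Fetches the feed at url with conditional-request headers, honours
    # per-domain rate limits, follows permanent redirects, backs off or mutes
    # the feed on errors, then reschedules the job and enqueues parsed entries
    # for storage.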
    def update_feed(self, url, etag=None, last_modified=None, subscribers=1,
                    backoff_factor=1, previous_error=None, link=None,
                    title=None, hub=None):
        url = URLObject(url)
        # Check if this domain has rate-limiting rules
        ratelimit_key = 'ratelimit:{0}'.format(
            url.netloc.without_auth().without_port())
        retry_at = cache.get(ratelimit_key)
        if retry_at:
            retry_in = (epoch_to_utc(retry_at) - timezone.now()).seconds
            schedule_job(url, schedule_in=retry_in,
                         connection=get_redis_connection())
            return

        if subscribers == 1:
            subscribers_text = '1 subscriber'
        else:
            subscribers_text = '{0} subscribers'.format(subscribers)

        headers = {
            'User-Agent': USER_AGENT % subscribers_text,
            'Accept': feedparser.ACCEPT_HEADER,
        }

        if last_modified:
            headers['If-Modified-Since'] = force_bytes(last_modified)
        if etag:
            headers['If-None-Match'] = force_bytes(etag)

        if settings.TESTS:
            # Make sure requests.get is properly mocked during tests
            if str(type(requests.get)) != "<class 'mock.MagicMock'>":
                raise ValueError("Not Mocked")

        auth = None
        if url.auth != (None, None):
            auth = url.auth

        start = datetime.datetime.now()
        error = None
        try:
            response = requests.get(
                six.text_type(url.without_auth()), headers=headers, auth=auth,
                timeout=UniqueFeed.request_timeout(backoff_factor))
        except (requests.RequestException, socket.timeout, socket.error,
                IncompleteRead, DecodeError) as e:
            logger.debug("Error fetching %s, %s" % (url, str(e)))
            if isinstance(e, IncompleteRead):
                error = UniqueFeed.CONNECTION_ERROR
            elif isinstance(e, DecodeError):
                error = UniqueFeed.DECODE_ERROR
            else:
                error = UniqueFeed.TIMEOUT
            self.backoff_feed(url, error, backoff_factor)
            return
        except LocationParseError:
            logger.debug(u"Failed to parse URL for {0}".format(url))
            self.mute_feed(url, UniqueFeed.PARSE_ERROR)
            return

        elapsed = (datetime.datetime.now() - start).seconds

        ctype = response.headers.get('Content-Type', None)
        if (response.history and
            url != response.url and ctype is not None and (
                ctype.startswith('application') or
                ctype.startswith('text/xml') or
                ctype.startswith('text/rss'))):
            redirection = None
            for index, redirect in enumerate(response.history):
                if redirect.status_code != 301:
                    break
                # Actual redirection is next request's url
                try:
                    redirection = response.history[index + 1].url
                except IndexError:  # next request is final request
                    redirection = response.url

            if redirection is not None and redirection != url:
                self.handle_redirection(url, redirection)

        update = {'last_update': int(time.time())}

        if response.status_code == 410:
            logger.debug(u"Feed gone, {0}".format(url))
            self.mute_feed(url, UniqueFeed.GONE)
            return

        elif response.status_code in [400, 401, 403, 404, 500, 502, 503]:
            self.backoff_feed(url, str(response.status_code), backoff_factor)
            return

        elif response.status_code not in [200, 204, 304]:
            logger.debug(u"{0} returned {1}".format(url, response.status_code))

            if response.status_code == 429:
                # Too Many Requests
                # Prevent next jobs from fetching the URL before retry-after
                retry_in = int(response.headers.get('Retry-After', 60))
                retry_at = timezone.now() + datetime.timedelta(
                    seconds=retry_in)
                cache.set(ratelimit_key,
                          int(retry_at.strftime('%s')),
                          retry_in)
                schedule_job(url, schedule_in=retry_in)
                return

        else:
            # Avoid going back to 1 directly if it isn't safe given the
            # actual response time.
            if previous_error and error is None:
                update['error'] = None
            backoff_factor = min(backoff_factor, self.safe_backoff(elapsed))
            update['backoff_factor'] = backoff_factor

        if response.status_code == 304:
            schedule_job(url,
                         schedule_in=UniqueFeed.delay(backoff_factor, hub),
                         connection=get_redis_connection(), **update)
            return

        if 'etag' in response.headers:
            update['etag'] = response.headers['etag']
        else:
            update['etag'] = None

        if 'last-modified' in response.headers:
            update['modified'] = response.headers['last-modified']
        else:
            update['modified'] = None

        try:
            if not response.content:
                content = ' '  # chardet won't detect encoding on empty strings
            else:
                content = response.content
        except socket.timeout:
            logger.debug(u'{0} timed out'.format(url))
            self.backoff_feed(url, UniqueFeed.TIMEOUT, backoff_factor)
            return

        parsed = feedparser.parse(content)

        if not is_feed(parsed):
            self.backoff_feed(url, UniqueFeed.NOT_A_FEED,
                              UniqueFeed.MAX_BACKOFF)
            return

        if 'link' in parsed.feed and parsed.feed.link != link:
            update['link'] = parsed.feed.link

        if 'title' in parsed.feed and parsed.feed.title != title:
            update['title'] = parsed.feed.title

        if 'links' in parsed.feed:
            for link in parsed.feed.links:
                if link.rel == 'hub':
                    update['hub'] = link.href
        if 'hub' not in update:
            update['hub'] = None
        else:
            subs_key = u'pshb:{0}'.format(url)
            enqueued = cache.get(subs_key)
            if not enqueued and not settings.DEBUG:
                cache.set(subs_key, True, 3600 * 24)
                enqueue(ensure_subscribed, args=[url, update['hub']],
                        queue='low')

        schedule_job(url,
                     schedule_in=UniqueFeed.delay(
                         update.get('backoff_factor', backoff_factor),
                         update['hub']),
                     connection=get_redis_connection(), **update)

        entries = list(filter(
            None,
            [self.entry_data(entry, parsed) for entry in parsed.entries]
        ))
        if len(entries):
            enqueue(store_entries, args=[url, entries], queue='store')
Example #46
    def update_feed(self,
                    url,
                    etag=None,
                    last_modified=None,
                    subscribers=1,
                    backoff_factor=1,
                    previous_error=None,
                    link=None,
                    title=None,
                    hub=None):
        url = URLObject(url)
        try:
            domain = url.netloc.without_auth().without_port()
        except TypeError as e:
            logger.info("invalid URL", url=url, exc_info=e)
            self.mute_feed(url, UniqueFeed.PARSE_ERROR)
            return
        # Check if this domain has rate-limiting rules
        ratelimit_key = 'ratelimit:{0}'.format(domain)
        retry_at = cache.get(ratelimit_key)
        if retry_at:
            retry_in = (epoch_to_utc(retry_at) - timezone.now()).seconds
            schedule_job(url,
                         schedule_in=retry_in,
                         connection=get_redis_connection())
            return

        if subscribers == 1:
            subscribers_text = '1 subscriber'
        else:
            subscribers_text = '{0} subscribers'.format(subscribers)

        headers = {
            'User-Agent': USER_AGENT % subscribers_text,
            'Accept': feedparser.ACCEPT_HEADER,
        }

        if last_modified:
            headers['If-Modified-Since'] = force_bytes(last_modified)
        if etag:
            headers['If-None-Match'] = force_bytes(etag)
        if last_modified or etag:
            headers['A-IM'] = b'feed'

        if settings.TESTS:
            # Make sure requests.get is properly mocked during tests
            if str(type(requests.get)) != "<class 'unittest.mock.MagicMock'>":
                raise ValueError("Not Mocked")

        auth = None
        if url.auth != (None, None):
            auth = url.auth

        start = datetime.datetime.now()
        error = None
        try:
            response = requests.get(
                six.text_type(url.without_auth()),
                headers=headers,
                auth=auth,
                timeout=UniqueFeed.request_timeout(backoff_factor))
        except (requests.RequestException, socket.timeout, socket.error,
                IncompleteRead, DecodeError) as e:
            logger.info("error fetching", url=url, exc_info=e)
            if isinstance(e, IncompleteRead):
                error = UniqueFeed.CONNECTION_ERROR
            elif isinstance(e, DecodeError):
                error = UniqueFeed.DECODE_ERROR
            else:
                error = UniqueFeed.TIMEOUT
            self.backoff_feed(url, error, backoff_factor)
            return
        except LocationParseError as e:
            logger.info("failed to parse URL", url=url, exc_info=e)
            self.mute_feed(url, UniqueFeed.PARSE_ERROR)
            return

        elapsed = (datetime.datetime.now() - start).seconds

        ctype = response.headers.get('Content-Type', None)
        if (response.history and url != response.url and ctype is not None and
            (ctype.startswith('application') or ctype.startswith('text/xml')
             or ctype.startswith('text/rss'))):
            redirection = None
            for index, redirect in enumerate(response.history):
                if redirect.status_code != 301:
                    break
                # Actual redirection is next request's url
                try:
                    redirection = response.history[index + 1].url
                except IndexError:  # next request is final request
                    redirection = response.url

            if redirection is not None and redirection != url:
                self.handle_redirection(url, redirection)

        update = {'last_update': int(time.time())}

        if response.status_code == 410:
            logger.info("feed gone", url=url)
            self.mute_feed(url, UniqueFeed.GONE)
            return

        elif response.status_code in {400, 401, 403, 404, 500, 502, 503, 521}:
            self.backoff_feed(url, str(response.status_code), backoff_factor)
            return

        elif response.status_code == 429:
            # Too Many Requests
            # Prevent next jobs from fetching the URL before retry-after
            retry_in = int(response.headers.get('Retry-After', 60))
            retry_at = timezone.now() + datetime.timedelta(seconds=retry_in)
            cache.set(ratelimit_key, int(retry_at.strftime('%s')), retry_in)
            schedule_job(url, schedule_in=retry_in)
            return

        elif response.status_code not in {200, 204, 226, 304}:
            logger.info("non-standard status code",
                        url=url,
                        status_code=response.status_code)

        else:
            # Avoid going back to 1 directly if it isn't safe given the
            # actual response time.
            if previous_error and error is None:
                update['error'] = None
            backoff_factor = min(backoff_factor, self.safe_backoff(elapsed))
            update['backoff_factor'] = backoff_factor

        if response.status_code == 304:
            schedule_job(url,
                         schedule_in=UniqueFeed.delay(backoff_factor, hub),
                         connection=get_redis_connection(),
                         **update)
            return

        if 'etag' in response.headers:
            update['etag'] = response.headers['etag']
        else:
            update['etag'] = None

        if 'last-modified' in response.headers:
            update['modified'] = response.headers['last-modified']
        else:
            update['modified'] = None

        try:
            if not response.content:
                content = ' '  # chardet won't detect encoding on empty strings
            else:
                content = response.content
        except socket.timeout:
            logger.info('timed out', url=url)
            self.backoff_feed(url, UniqueFeed.TIMEOUT, backoff_factor)
            return

        parsed = feedparser.parse(content)

        if not is_feed(parsed):
            self.backoff_feed(url, UniqueFeed.NOT_A_FEED,
                              UniqueFeed.MAX_BACKOFF)
            return

        if 'link' in parsed.feed and parsed.feed.link != link:
            update['link'] = parsed.feed.link

        if 'title' in parsed.feed and parsed.feed.title != title:
            update['title'] = parsed.feed.title

        if 'links' in parsed.feed:
            for link in parsed.feed.links:
                if link.rel == 'hub':
                    update['hub'] = link.href
        if 'hub' not in update:
            update['hub'] = None
        else:
            subs_key = u'pshb:{0}'.format(url)
            enqueued = cache.get(subs_key)
            if not enqueued and not settings.DEBUG:
                cache.set(subs_key, True, 3600 * 24)
                enqueue(ensure_subscribed,
                        args=[url, update['hub']],
                        queue='low')

        schedule_job(url,
                     schedule_in=UniqueFeed.delay(
                         update.get('backoff_factor', backoff_factor),
                         update['hub']),
                     connection=get_redis_connection(),
                     **update)

        entries = list(
            filter(
                None,
                [self.entry_data(entry, parsed) for entry in parsed.entries]))
        if len(entries):
            enqueue(store_entries, args=[url, entries], queue='store')
Example #47
 def test_job_deletion(self):
     schedule_job('bar', schedule_in=-1)
     delete_job('bar')
     self.assertEqual(len(list(pending_jobs())), 0)
Example #48
 def test_schedule_with_timedelta(self):
     schedule_job('delta', schedule_in=timedelta(seconds=-1))
Example #49
 def test_schedule_with_id(self):
     with self.assertRaises(RuntimeError):
         schedule_job('testing', schedule_in=1, id=12)
Example #50
    def test_custom_connection(self):
        for i in range(10):
            schedule_job('foo{0}'.format(i), schedule_in=-1, connection=r)

        jobs = list(pending_jobs(connection=r))
        self.assertEqual(len(jobs), 10)
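
The examples above all exercise the same small Redis-backed scheduler API: schedule_job stores or reschedules a job by id together with arbitrary keyword data, scheduled_jobs lists everything still in the schedule (optionally with timestamps), pending_jobs yields and consumes jobs whose time has passed, job_details inspects a single job, and delete_job removes one. The sketch below strings these calls together in one place; it is inferred from the tests above rather than taken from any of the source projects, and the import path (rache) is an assumption to adjust to wherever these functions live in your codebase.

# Usage sketch inferred from the examples above; the import path is assumed.
from datetime import timedelta

from rache import (schedule_job, pending_jobs, scheduled_jobs,
                   job_details, delete_job)

# Store a job with arbitrary keyword data; a negative delay makes it pending
# immediately, and a timedelta works as well as an integer number of seconds.
schedule_job('example-feed', schedule_in=-1, backoff_factor=1)
schedule_job('later-feed', schedule_in=timedelta(minutes=5), title='Later')

# Inspect a single job: its stored attributes plus 'id' and 'schedule_at'.
print(job_details('later-feed'))

# Everything still in the schedule shows up in scheduled_jobs().
for job_id, timestamp in scheduled_jobs(with_times=True):
    print(job_id, timestamp)

# pending_jobs() yields (and consumes) jobs whose scheduled time has passed,
# so 'example-feed' comes out here and 'later-feed' does not.
for job in pending_jobs():
    print(job['id'], job.get('backoff_factor'))

# Drop a job that is no longer needed.
delete_job('later-feed')

As Examples #30 and #32 show, schedule_in is required and the id keyword is reserved for the job identifier.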