def update_feed(url, etag=None, modified=None, subscribers=1,
                request_timeout=10, backoff_factor=1, error=None, link=None,
                title=None, hub=None):
    from .models import UniqueFeed
    try:
        UniqueFeed.objects.update_feed(
            url, etag=etag, last_modified=modified, subscribers=subscribers,
            backoff_factor=backoff_factor, previous_error=error, link=link,
            title=title, hub=hub)
    except JobTimeoutException:
        backoff_factor = min(UniqueFeed.MAX_BACKOFF, backoff_factor + 1)
        logger.debug("Job timed out, backing off %s to %s" % (
            url, backoff_factor,
        ))
        schedule_job(url, schedule_in=UniqueFeed.delay(backoff_factor),
                     backoff_factor=backoff_factor,
                     connection=get_redis_connection())
def backoff_feed(self, url, error, backoff_factor):
    if backoff_factor == UniqueFeed.MAX_BACKOFF - 1:
        logger.info("reached max backoff factor", url=url, error=error)
    backoff_factor = min(UniqueFeed.MAX_BACKOFF, backoff_factor + 1)
    schedule_job(url, schedule_in=UniqueFeed.delay(backoff_factor),
                 error=error, backoff_factor=backoff_factor,
                 connection=get_redis_connection())
def test_schedule_limit_items_count(self):
    for i in range(100):
        schedule_job('foo{0}'.format(i), schedule_in=-1)
    jobs = list(pending_jobs(limit=10))
    self.assertEqual(len(jobs), 10)
    self.assertEqual(len(list(scheduled_jobs())), 90)
def test_scheduler_backup(self, get):
    get.return_value = responses(304)
    feed = FeedFactory.create()
    with self.assertNumQueries(1):
        call_command('backup_scheduler')

    schedule_job(feed.url, schedule_in=10, subscribers=10, etag='foobar',
                 backoff_factor=2, last_update=int(time.time()) + 10,
                 title="f" * 2049)
    with self.assertNumQueries(1):
        call_command('backup_scheduler')

    schedule_job(feed.url, schedule_in=10, title='12')
    with self.assertNumQueries(1):
        call_command('backup_scheduler')

    unique = UniqueFeed.objects.get()
    self.assertEqual(unique.subscribers, 10)
    self.assertEqual(unique.backoff_factor, 2)
    self.assertEqual(unique.etag, 'foobar')
    self.assertEqual(unique.modified, '')
    delta = (unique.last_update - timezone.now()).seconds
    self.assertTrue(5 < delta < 10)

    for i in range(4):
        FeedFactory.create()
    with self.assertNumQueries(5):
        call_command('backup_scheduler')
def test_errors(self, get):
    codes = [400, 401, 403, 404, 500, 502, 503]

    def get_side_effect():
        yield responses(304)
        for code in codes:
            yield responses(code)
    get.side_effect = get_side_effect()

    feed = FeedFactory.create()
    self.assertEqual(len(get.call_args_list), 1)

    for code in codes:
        get.return_value = responses(code)
        feed = UniqueFeed.objects.get(url=feed.url)
        self.assertFalse(feed.muted)
        self.assertEqual(feed.job_details.get('error'), None)
        self.assertEqual(feed.job_details['backoff_factor'], 1)
        feed.schedule()
        data = job_details(feed.url, connection=get_redis_connection())

        update_feed(feed.url, backoff_factor=data['backoff_factor'])
        feed = UniqueFeed.objects.get(url=feed.url)
        self.assertFalse(feed.muted)
        data = job_details(feed.url, connection=get_redis_connection())
        self.assertEqual(data['error'], code)
        self.assertEqual(data['backoff_factor'], 2)

        # Restore status for next iteration
        schedule_job(feed.url, backoff_factor=1, error=None, schedule_in=0)
        feed = UniqueFeed.objects.get(url=feed.url)
        self.assertEqual(feed.job_details.get('error'), None)
def test_job_data(self):
    schedule_job('fòo', schedule_in=-1, other_arg='lol')
    self.assertEqual(list(pending_jobs()), [{
        'id': 'fòo',
        'other_arg': 'lol',
    }])
    self.assertEqual(len(list(pending_jobs())), 0)
def test_scheduled_jobs(self):
    schedule_job('jòb', schedule_in=10)
    schedule_job('ötherjòb', schedule_in=20)
    schedule = scheduled_jobs(with_times=True)
    self.assertEqual([s[0] for s in schedule], ['jòb', 'ötherjòb'])
    schedule = list(scheduled_jobs())
    self.assertEqual(schedule, ['jòb', 'ötherjòb'])
def test_manage_feed(self, get):
    get.return_value = responses(304)
    user = UserFactory.create()
    url = reverse('feeds:manage')
    response = self.app.get(url, user=user)
    self.assertContains(response, 'Manage feeds')

    FeedFactory.create(user=user, category=None)
    FeedFactory.create(user=user, category=None)
    FeedFactory.create(user=user, category=None)

    unique = UniqueFeed.objects.all()[0]
    schedule_job(unique.url, schedule_in=0, backoff_factor=10,
                 error=UniqueFeed.NOT_A_FEED,
                 connection=get_redis_connection())
    response = self.app.get(url, user=user)
    self.assertContains(response, 'Not a valid RSS/Atom feed')

    schedule_job(unique.url, schedule_in=0, error='blah',
                 connection=get_redis_connection())
    response = self.app.get(url, user=user)
    self.assertContains(response, 'Error')

    unique.muted = True
    unique.save()
    response = self.app.get(url, user=user)
    self.assertContains(response, 'Error')
def test_job_details(self):
    schedule_job('details', schedule_in=-1, stuff='baz', other=123)
    self.assertEqual(job_details('details'), {
        'id': 'details',
        'stuff': 'baz',
        'schedule_at': int(time.time()) - 1,
        'other': 123,
    })
def backoff_feed(self, url, error, backoff_factor):
    if backoff_factor == UniqueFeed.MAX_BACKOFF - 1:
        logger.debug(u"{0} reached max backoff period ({1})".format(
            url, error,
        ))
    backoff_factor = min(UniqueFeed.MAX_BACKOFF, backoff_factor + 1)
    schedule_job(url, schedule_in=UniqueFeed.delay(backoff_factor),
                 error=error, backoff_factor=backoff_factor,
                 connection=get_redis_connection())
def test_remove_keys(self):
    schedule_job('foobar', schedule_in=-1, attr='stuff', other=12,
                 thing='blah blah')
    jobs = list(pending_jobs())
    self.assertEqual(jobs, [{'id': 'foobar', 'attr': 'stuff', 'other': 12,
                             'thing': 'blah blah'}])

    schedule_job('foobar', schedule_in=-1, attr=None, other=None,
                 thing='blah blah')
    jobs = list(pending_jobs())
    self.assertEqual(jobs, [{'id': 'foobar', 'thing': 'blah blah'}])
def test_legacy_redis(self):
    connection = redis.Redis(**REDIS)
    for i in range(10):
        schedule_job('foo{0}'.format(i), schedule_in=-1,
                     connection=connection)
    jobs = list(pending_jobs(connection=connection, reschedule_in=-1))
    self.assertEqual(len(jobs), 10)
    jobs = list(pending_jobs(connection=r, reschedule_in=-1))
    self.assertEqual(len(jobs), 10)
def test_reschedule(self):
    schedule_job('baz', schedule_in=-1)
    schedule_job('foo', schedule_in=10)
    jobs = list(pending_jobs(reschedule_in=20))
    self.assertEqual(jobs, [{'id': 'baz'}])
    schedule = list(scheduled_jobs(with_times=True))
    foo = schedule[0]
    baz = schedule[1]
    self.assertEqual(foo[0], 'foo')
    self.assertEqual(baz[0], 'baz')
    self.assertEqual(foo[1] + 10, baz[1])
def test_uniquefeed_model(self, get):
    get.return_value = responses(304)
    FeedFactory.create(url='http://example.com/' + 'foo/' * 200)
    unique = UniqueFeed.objects.get()
    self.assertEqual(len(unique.truncated_url()), 50)
    unique.delete()

    FeedFactory.create(url='http://example.com/foo/')
    unique = UniqueFeed.objects.get()
    self.assertEqual(len(unique.truncated_url()), len(unique.url))

    unique = UniqueFeed(url='http://foo.com')
    self.assertEqual('%s' % unique, 'http://foo.com')

    self.assertIs(UniqueFeedManager.entry_data({}, None), None)

    unique.schedule()
    details = unique.job_details
    at = details.pop('schedule_at')
    details.pop('last_update')
    self.assertEqual(details, {
        u"backoff_factor": 1,
        u"subscribers": 1,
        u"id": "http://foo.com",
    })
    details['schedule_at'] = at
    self.assertEqual(unique.job_details['id'], "http://foo.com")
    self.assertTrue(unique.scheduler_data.startswith("{\n"))
    self.assertTrue(unique.next_update > timezone.now())
    self.assertTrue(
        unique.next_update < timezone.now() + timedelta(seconds=60 * 61))

    schedule_job(unique.url, title='Lol', schedule_in=0)
    del unique._job_details
    details = unique.job_details
    details.pop('schedule_at')
    details.pop('last_update')
    self.assertEqual(details, {
        u"title": u"Lol",
        u"backoff_factor": 1,
        u"subscribers": 1,
        u"id": "http://foo.com",
    })
def schedule(self, schedule_in=None, **job):
    if hasattr(self, '_job_details'):
        del self._job_details
    connection = get_redis_connection()
    kwargs = {
        'subscribers': 1,
        'backoff_factor': 1,
        'last_update': int(time.time()),
    }
    kwargs.update(job)
    if schedule_in is None:
        try:
            for attr in self.JOB_ATTRS:
                if attr in self.job_details:
                    kwargs[attr] = self.job_details[attr]
            schedule_in = self.schedule_in
        except JobNotFound:
            schedule_in = self.delay(kwargs['backoff_factor'])
    schedule_job(self.url, schedule_in=schedule_in, connection=connection,
                 **kwargs)
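from datetime import timedelta


# Illustrative sketch (an assumption, not the project's actual code):
# schedule() above turns a backoff factor into a delay via UniqueFeed.delay()
# and caps it with UniqueFeed.MAX_BACKOFF. A minimal version could look like
# this; the real constants, growth curve and PubSubHubbub (hub) handling may
# differ.
class UniqueFeedSketch(object):
    MAX_BACKOFF = 10     # assumed cap on the backoff factor
    UPDATE_PERIOD = 60   # assumed base polling period, in minutes

    @classmethod
    def delay(cls, backoff_factor, hub=None):
        # Grow the polling interval with the backoff factor, never past the
        # interval implied by MAX_BACKOFF. Hub-enabled feeds could be polled
        # less aggressively; that logic is omitted here.
        factor = min(cls.MAX_BACKOFF, backoff_factor)
        return timedelta(minutes=cls.UPDATE_PERIOD * factor)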
def update_feed(url, etag=None, modified=None, subscribers=1,
                request_timeout=10, backoff_factor=1, error=None, link=None,
                title=None, hub=None):
    from .models import UniqueFeed
    try:
        UniqueFeed.objects.update_feed(
            url, etag=etag, last_modified=modified, subscribers=subscribers,
            backoff_factor=backoff_factor, previous_error=error,
            link=link, title=title, hub=hub)
    except JobTimeoutException:
        backoff_factor = min(UniqueFeed.MAX_BACKOFF, backoff_factor + 1)
        logger.info("job timed out, backing off",
                    url=url, backoff_factor=backoff_factor)
        schedule_job(url, schedule_in=UniqueFeed.delay(backoff_factor),
                     backoff_factor=backoff_factor,
                     connection=get_redis_connection())
    except BaseException as e:
        logger.info("fatal job exception", url=url, exc_info=e)
        raise
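# Illustrative sketch (an assumption, not part of the project): a worker loop
# that drains due jobs from the scheduler and hands them to update_feed()
# above. Job dicts carry the feed URL as 'id' plus whatever keys were stored
# via schedule_job(); anything missing falls back to update_feed()'s
# defaults. The 60-second reschedule_in value is made up for the example.
def process_pending_feeds():
    for job in pending_jobs(reschedule_in=60,
                            connection=get_redis_connection()):
        url = job.pop('id')
        # Forward only the keys update_feed() actually accepts.
        kwargs = {key: job[key]
                  for key in ('etag', 'modified', 'subscribers',
                              'backoff_factor', 'error', 'link', 'title',
                              'hub')
                  if key in job}
        update_feed(url, **kwargs)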
def test_schedule_with_id(self):
    with self.assertRaises(RuntimeError):
        schedule_job('testing', schedule_in=1, id=12)
def test_ordering(self):
    schedule_job('foo', schedule_in=-1)
    schedule_job('bar', schedule_in=-2)
    jobs = list(pending_jobs())
    self.assertEqual(jobs[0]['id'], 'bar')
    self.assertEqual(jobs[1]['id'], 'foo')
def test_schedule_without_delay(self):
    with self.assertRaises(TypeError):
        schedule_job('trololol')
def test_schedule_with_timedelta(self):
    # Only asserts that a timedelta schedule_in value is accepted without
    # raising.
    schedule_job('delta', schedule_in=timedelta(seconds=-1))
def test_schedule_non_unicode_data(self):
    schedule_job('bad', schedule_in=-1,
                 etag=b'2013/6/29 \xa4W\xa4\xc8 09:51:31')
    job = list(pending_jobs())[0]
    self.assertEqual(job['etag'], b'2013/6/29 \xa4W\xa4\xc8 09:51:31')
def test_reschedule_existing(self):
    schedule_job('lol', schedule_in=-1)
    schedule_job('lol', schedule_in=10)
    self.assertEqual(len(list(pending_jobs())), 0)
    schedule_job('lol', schedule_in=-1)
    self.assertEqual(len(list(pending_jobs())), 1)
def test_schedule_in_future(self):
    schedule_job('lol', schedule_in=10)
    self.assertEqual(len(list(pending_jobs())), 0)
    delete_job('lol')
def test_job_deletion(self):
    schedule_job('bar', schedule_in=-1)
    delete_job('bar')
    self.assertEqual(len(list(pending_jobs())), 0)
def test_custom_connection(self):
    for i in range(10):
        schedule_job('foo{0}'.format(i), schedule_in=-1, connection=r)
    jobs = list(pending_jobs(connection=r))
    self.assertEqual(len(jobs), 10)
def update_feed(self, url, etag=None, last_modified=None, subscribers=1,
                backoff_factor=1, previous_error=None, link=None, title=None,
                hub=None):
    url = URLObject(url)
    # Check if this domain has rate-limiting rules
    ratelimit_key = 'ratelimit:{0}'.format(
        url.netloc.without_auth().without_port())
    retry_at = cache.get(ratelimit_key)
    if retry_at:
        retry_in = (epoch_to_utc(retry_at) - timezone.now()).seconds
        schedule_job(url, schedule_in=retry_in,
                     connection=get_redis_connection())
        return

    if subscribers == 1:
        subscribers_text = '1 subscriber'
    else:
        subscribers_text = '{0} subscribers'.format(subscribers)

    headers = {
        'User-Agent': USER_AGENT % subscribers_text,
        'Accept': feedparser.ACCEPT_HEADER,
    }

    if last_modified:
        headers['If-Modified-Since'] = force_bytes(last_modified)
    if etag:
        headers['If-None-Match'] = force_bytes(etag)

    if settings.TESTS:
        # Make sure requests.get is properly mocked during tests
        if str(type(requests.get)) != "<class 'mock.MagicMock'>":
            raise ValueError("Not Mocked")

    auth = None
    if url.auth != (None, None):
        auth = url.auth

    start = datetime.datetime.now()
    error = None
    try:
        response = requests.get(
            six.text_type(url.without_auth()), headers=headers, auth=auth,
            timeout=UniqueFeed.request_timeout(backoff_factor))
    except (requests.RequestException, socket.timeout, socket.error,
            IncompleteRead, DecodeError) as e:
        logger.debug("Error fetching %s, %s" % (url, str(e)))
        if isinstance(e, IncompleteRead):
            error = UniqueFeed.CONNECTION_ERROR
        elif isinstance(e, DecodeError):
            error = UniqueFeed.DECODE_ERROR
        else:
            error = UniqueFeed.TIMEOUT
        self.backoff_feed(url, error, backoff_factor)
        return
    except LocationParseError:
        logger.debug(u"Failed to parse URL for {0}".format(url))
        self.mute_feed(url, UniqueFeed.PARSE_ERROR)
        return

    elapsed = (datetime.datetime.now() - start).seconds

    ctype = response.headers.get('Content-Type', None)
    if (response.history and
        url != response.url and ctype is not None and (
            ctype.startswith('application') or
            ctype.startswith('text/xml') or
            ctype.startswith('text/rss'))):
        redirection = None
        for index, redirect in enumerate(response.history):
            if redirect.status_code != 301:
                break
            # Actual redirection is next request's url
            try:
                redirection = response.history[index + 1].url
            except IndexError:  # next request is final request
                redirection = response.url

        if redirection is not None and redirection != url:
            self.handle_redirection(url, redirection)

    update = {'last_update': int(time.time())}

    if response.status_code == 410:
        logger.debug(u"Feed gone, {0}".format(url))
        self.mute_feed(url, UniqueFeed.GONE)
        return

    elif response.status_code in [400, 401, 403, 404, 500, 502, 503]:
        self.backoff_feed(url, str(response.status_code), backoff_factor)
        return

    elif response.status_code not in [200, 204, 304]:
        logger.debug(u"{0} returned {1}".format(url, response.status_code))

        if response.status_code == 429:
            # Too Many Requests
            # Prevent next jobs from fetching the URL before retry-after
            retry_in = int(response.headers.get('Retry-After', 60))
            retry_at = timezone.now() + datetime.timedelta(
                seconds=retry_in)
            cache.set(ratelimit_key, int(retry_at.strftime('%s')), retry_in)
            schedule_job(url, schedule_in=retry_in)
            return

    else:
        # Avoid going back to 1 directly if it isn't safe given the
        # actual response time.
        if previous_error and error is None:
            update['error'] = None
        backoff_factor = min(backoff_factor, self.safe_backoff(elapsed))
        update['backoff_factor'] = backoff_factor

    if response.status_code == 304:
        schedule_job(url, schedule_in=UniqueFeed.delay(backoff_factor, hub),
                     connection=get_redis_connection(), **update)
        return

    if 'etag' in response.headers:
        update['etag'] = response.headers['etag']
    else:
        update['etag'] = None

    if 'last-modified' in response.headers:
        update['modified'] = response.headers['last-modified']
    else:
        update['modified'] = None

    try:
        if not response.content:
            content = ' '  # chardet won't detect encoding on empty strings
        else:
            content = response.content
    except socket.timeout:
        logger.debug(u'{0} timed out'.format(url))
        self.backoff_feed(url, UniqueFeed.TIMEOUT, backoff_factor)
        return

    parsed = feedparser.parse(content)

    if not is_feed(parsed):
        self.backoff_feed(url, UniqueFeed.NOT_A_FEED, UniqueFeed.MAX_BACKOFF)
        return

    if 'link' in parsed.feed and parsed.feed.link != link:
        update['link'] = parsed.feed.link

    if 'title' in parsed.feed and parsed.feed.title != title:
        update['title'] = parsed.feed.title

    if 'links' in parsed.feed:
        for link in parsed.feed.links:
            if link.rel == 'hub':
                update['hub'] = link.href
    if 'hub' not in update:
        update['hub'] = None
    else:
        subs_key = u'pshb:{0}'.format(url)
        enqueued = cache.get(subs_key)
        if not enqueued and not settings.DEBUG:
            cache.set(subs_key, True, 3600 * 24)
            enqueue(ensure_subscribed, args=[url, update['hub']],
                    queue='low')

    schedule_job(url,
                 schedule_in=UniqueFeed.delay(
                     update.get('backoff_factor', backoff_factor),
                     update['hub']),
                 connection=get_redis_connection(), **update)

    entries = list(filter(
        None,
        [self.entry_data(entry, parsed) for entry in parsed.entries]
    ))
    if len(entries):
        enqueue(store_entries, args=[url, entries], queue='store')
def update_feed(self, url, etag=None, last_modified=None, subscribers=1,
                backoff_factor=1, previous_error=None, link=None, title=None,
                hub=None):
    url = URLObject(url)
    try:
        domain = url.netloc.without_auth().without_port()
    except TypeError as e:
        logger.info("invalid URL", url=url, exc_info=e)
        self.mute_feed(url, UniqueFeed.PARSE_ERROR)
        return

    # Check if this domain has rate-limiting rules
    ratelimit_key = 'ratelimit:{0}'.format(domain)
    retry_at = cache.get(ratelimit_key)
    if retry_at:
        retry_in = (epoch_to_utc(retry_at) - timezone.now()).seconds
        schedule_job(url, schedule_in=retry_in,
                     connection=get_redis_connection())
        return

    if subscribers == 1:
        subscribers_text = '1 subscriber'
    else:
        subscribers_text = '{0} subscribers'.format(subscribers)

    headers = {
        'User-Agent': USER_AGENT % subscribers_text,
        'Accept': feedparser.ACCEPT_HEADER,
    }

    if last_modified:
        headers['If-Modified-Since'] = force_bytes(last_modified)
    if etag:
        headers['If-None-Match'] = force_bytes(etag)
    if last_modified or etag:
        headers['A-IM'] = b'feed'

    if settings.TESTS:
        # Make sure requests.get is properly mocked during tests
        if str(type(requests.get)) != "<class 'unittest.mock.MagicMock'>":
            raise ValueError("Not Mocked")

    auth = None
    if url.auth != (None, None):
        auth = url.auth

    start = datetime.datetime.now()
    error = None
    try:
        response = requests.get(
            six.text_type(url.without_auth()), headers=headers, auth=auth,
            timeout=UniqueFeed.request_timeout(backoff_factor))
    except (requests.RequestException, socket.timeout, socket.error,
            IncompleteRead, DecodeError) as e:
        logger.info("error fetching", url=url, exc_info=e)
        if isinstance(e, IncompleteRead):
            error = UniqueFeed.CONNECTION_ERROR
        elif isinstance(e, DecodeError):
            error = UniqueFeed.DECODE_ERROR
        else:
            error = UniqueFeed.TIMEOUT
        self.backoff_feed(url, error, backoff_factor)
        return
    except LocationParseError as e:
        logger.info("failed to parse URL", url=url, exc_info=e)
        self.mute_feed(url, UniqueFeed.PARSE_ERROR)
        return

    elapsed = (datetime.datetime.now() - start).seconds

    ctype = response.headers.get('Content-Type', None)
    if (response.history and
        url != response.url and ctype is not None and (
            ctype.startswith('application') or
            ctype.startswith('text/xml') or
            ctype.startswith('text/rss'))):
        redirection = None
        for index, redirect in enumerate(response.history):
            if redirect.status_code != 301:
                break
            # Actual redirection is next request's url
            try:
                redirection = response.history[index + 1].url
            except IndexError:  # next request is final request
                redirection = response.url

        if redirection is not None and redirection != url:
            self.handle_redirection(url, redirection)

    update = {'last_update': int(time.time())}

    if response.status_code == 410:
        logger.info("feed gone", url=url)
        self.mute_feed(url, UniqueFeed.GONE)
        return

    elif response.status_code in {400, 401, 403, 404, 500, 502, 503, 521}:
        self.backoff_feed(url, str(response.status_code), backoff_factor)
        return

    elif response.status_code == 429:
        # Too Many Requests
        # Prevent next jobs from fetching the URL before retry-after
        retry_in = int(response.headers.get('Retry-After', 60))
        retry_at = timezone.now() + datetime.timedelta(seconds=retry_in)
        cache.set(ratelimit_key, int(retry_at.strftime('%s')), retry_in)
        schedule_job(url, schedule_in=retry_in)
        return

    elif response.status_code not in {200, 204, 226, 304}:
        logger.info("non-standard status code", url=url,
                    status_code=response.status_code)

    else:
        # Avoid going back to 1 directly if it isn't safe given the
        # actual response time.
        if previous_error and error is None:
            update['error'] = None
        backoff_factor = min(backoff_factor, self.safe_backoff(elapsed))
        update['backoff_factor'] = backoff_factor

    if response.status_code == 304:
        schedule_job(url, schedule_in=UniqueFeed.delay(backoff_factor, hub),
                     connection=get_redis_connection(), **update)
        return

    if 'etag' in response.headers:
        update['etag'] = response.headers['etag']
    else:
        update['etag'] = None

    if 'last-modified' in response.headers:
        update['modified'] = response.headers['last-modified']
    else:
        update['modified'] = None

    try:
        if not response.content:
            content = ' '  # chardet won't detect encoding on empty strings
        else:
            content = response.content
    except socket.timeout:
        logger.info('timed out', url=url)
        self.backoff_feed(url, UniqueFeed.TIMEOUT, backoff_factor)
        return

    parsed = feedparser.parse(content)

    if not is_feed(parsed):
        self.backoff_feed(url, UniqueFeed.NOT_A_FEED, UniqueFeed.MAX_BACKOFF)
        return

    if 'link' in parsed.feed and parsed.feed.link != link:
        update['link'] = parsed.feed.link

    if 'title' in parsed.feed and parsed.feed.title != title:
        update['title'] = parsed.feed.title

    if 'links' in parsed.feed:
        for link in parsed.feed.links:
            if link.rel == 'hub':
                update['hub'] = link.href
    if 'hub' not in update:
        update['hub'] = None
    else:
        subs_key = u'pshb:{0}'.format(url)
        enqueued = cache.get(subs_key)
        if not enqueued and not settings.DEBUG:
            cache.set(subs_key, True, 3600 * 24)
            enqueue(ensure_subscribed, args=[url, update['hub']],
                    queue='low')

    schedule_job(url,
                 schedule_in=UniqueFeed.delay(
                     update.get('backoff_factor', backoff_factor),
                     update['hub']),
                 connection=get_redis_connection(), **update)

    entries = list(
        filter(
            None,
            [self.entry_data(entry, parsed) for entry in parsed.entries]))
    if len(entries):
        enqueue(store_entries, args=[url, entries], queue='store')
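# Illustrative sketches (assumptions, not the project's actual helpers):
# update_feed() above leans on two small utilities, epoch_to_utc() to turn
# the cached retry-after epoch back into an aware datetime, and is_feed()
# to reject documents feedparser could not recognise as a feed. Minimal
# versions could look like this, assuming a Django environment with pytz.
import datetime

import pytz
from django.utils import timezone


def epoch_to_utc(value):
    # Convert a Unix timestamp (in seconds) to a timezone-aware UTC datetime.
    return timezone.make_aware(
        datetime.datetime.utcfromtimestamp(value), pytz.utc)


def is_feed(parsed):
    # Treat a parse result with no feed-level data and no entries as
    # "not a feed"; the real check may be stricter.
    return bool(parsed.feed and (parsed.entries or 'title' in parsed.feed))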