def test_derive_key_name_from_video(self):
    """Each fetched video has subtitles stored under its derived key name."""
    self._set_responses_xrange(BATCH_SIZE)
    _task_handler('UUID')

    for video in Video.all().fetch(BATCH_SIZE):
        # Rebuild the key from the video's own id, then look the record up.
        key_name = VideoSubtitles.get_key_name('en', video.youtube_id)
        self.assertIsNotNone(VideoSubtitles.get_by_key_name(key_name))
def test_assume_utf8_encoded_content(self):
    """Response bytes are decoded as UTF-8 before being stored."""
    # Universal Subtitles API returns utf-8.
    # The byte string below is the utf-8 encoding of the two-character
    # unicode string that follows it.
    encoded_body = '\xc3\x84\xc3\x90'
    decoded_text = u'\xc4\xd0'

    self._set_responses_xrange(1, content=encoded_body)
    _task_handler('UUID')

    # Exactly one record was written ...
    self.assertEqual(VideoSubtitles.all().count(), 1)

    # ... and its payload is the decoded unicode text, not the raw bytes.
    stored = VideoSubtitles.all().get()
    self.assertEqual(stored.json, decoded_text)
def download_subtitles(videos, report=None):
    """Fetch English subtitles for each video and store the ones that changed.

    One asynchronous URL fetch is started per video, then the results are
    processed in the same order. A failure while processing one video is
    logged and counted, and does not stop the remaining videos.

    Args:
        videos: iterable of objects exposing a ``youtube_id`` attribute.
        report: optional dict of counters, updated in place under the keys
            'fetches', 'redirects', 'writes' and 'errors'. When None, a
            fresh copy of REPORT_TEMPLATE is used.

    Returns:
        The report dict: the one passed in, or the newly created copy. This
        lets callers that pass no report still read the counters.
    """
    if report is None:
        report = dict(REPORT_TEMPLATE)

    # Start every fetch before reading any result so the requests overlap.
    pending = []
    for video in videos:
        url = UNISUBS_URL % urllib.quote(YOUTUBE_URL % video.youtube_id)
        rpc = urlfetch.create_rpc(deadline=TIMEOUT_SECONDS)
        urlfetch.make_fetch_call(rpc, url)
        pending.append((video.youtube_id, rpc))
        report['fetches'] += 1

    # Process asynchronous fetches
    lang = 'en'
    for youtube_id, rpc in pending:
        key_name = VideoSubtitles.get_key_name(lang, youtube_id)
        try:
            resp = rpc.get_result()
            if resp.status_code != 200:
                raise RuntimeError('status code: %s' % resp.status_code)

            # A redirect is only reported; the response is still processed.
            if resp.final_url:
                logging.warn('%s redirect to %s' % (key_name, resp.final_url))
                report['redirects'] += 1

            # Named subs_json (not json) to avoid shadowing the stdlib module.
            subs_json = resp.content.decode('utf-8')

            # Only update stale records
            current = VideoSubtitles.get_by_key_name(key_name)
            if not current or current.json != subs_json:
                new = VideoSubtitles(key_name=key_name,
                                     youtube_id=youtube_id,
                                     language=lang,
                                     json=subs_json)
                new.put()
                report['writes'] += 1
            else:
                logging.info('%s content already up-to-date' % key_name)
        except Exception as e:
            # Deliberately broad: fetch, decode and put failures are all
            # counted as errors so one bad video cannot abort the batch.
            logging.error('%s subtitles fetch failed: %s' % (key_name, e))
            report['errors'] += 1

    return report
def test_process_next_batch_on_nonempty_cursor(self):
    """Starting from a cursor, one batch of subtitles is stored."""
    skipped = 3

    # these should be skipped, they'll DownloadError
    for n in xrange(skipped):
        Video(youtube_id=str(n)).put()

    # these should be downloaded
    self._set_responses_xrange(skipped, BATCH_SIZE + skipped)

    # Advance a query past the skipped videos to obtain the start cursor.
    video_query = Video.all()
    video_query.fetch(skipped)
    _task_handler('UUID', cursor=video_query.cursor())

    self.assertEqual(VideoSubtitles.all().count(), BATCH_SIZE)
def test_should_not_put_duplicate_subtitles(self, info):
    """A second fetch with identical content logs a skip and writes nothing."""
    self._set_responses_xrange(BATCH_SIZE, content="some json")

    # first fetch: every record is new, so nothing is logged as skipped
    _task_handler('UUID', 0)
    self.assertEqual(VideoSubtitles.all().count(), BATCH_SIZE)
    self.assertEqual(info.call_count, 0)

    with patch('unisubs.VideoSubtitles') as mock_subs_cls:
        # Keep the real key derivation and lookup so existing records are
        # found; only construction and put() go through the mock.
        mock_subs_cls.get_key_name = VideoSubtitles.get_key_name
        mock_subs_cls.get_by_key_name = VideoSubtitles.get_by_key_name

        # second fetch, same content
        _task_handler('UUID', 1)

        put_calls = mock_subs_cls.return_value.put.call_count
        self.assertEqual(put_calls, 0,
                         'duplicate subtitles should not be put()')

    self.assertEqual(info.call_count, BATCH_SIZE,
                     'skipped put should be logged')
def test_process_first_batch_on_empty_cursor(self):
    """Without a cursor, the handler stores one full batch of subtitles."""
    self._set_responses_xrange(BATCH_SIZE)
    _task_handler('UUID')

    stored_count = VideoSubtitles.all().count()
    self.assertEqual(stored_count, BATCH_SIZE)
def _task_handler(uid, task_id=0, cursor=None, report=None):
    """Task chain for fetching subtitles from the Universal Subtitles API

    It processes Video models in batches of BATCH_SIZE by fetching the English
    subtitles via an HTTP API call.

    This job runs regularly so fetch failures are fixed from run-to-run. Fetch
    failures are logged and suppressed as the task marches on.

    Errors include URL fetch timeouts, subtitles put failures, and response
    decoding failures.

    HTTP redirects indicate that the code needs updating to a new API endpoint.
    They are detected and reported separately.

    Args:
        uid: key name under which the VideoSubtitlesFetchReport is stored.
        task_id: accepted for the caller's benefit; not read by this body.
        cursor: datastore query cursor marking where this batch starts, or
            None to start from the first Video.
        report: dict of counters carried between batches. When None, a fresh
            copy of REPORT_TEMPLATE is created and an initial report entity
            is stored.
    """
    query = Video.all()
    query.with_cursor(cursor)
    videos = query.fetch(BATCH_SIZE)

    if report is None:
        # No report was passed in: create the counters and store the
        # initial report entity under this uid.
        report = dict(REPORT_TEMPLATE)
        VideoSubtitlesFetchReport(key_name=uid, **report).put()

    # The fetch/store logic lives in download_subtitles, which updates the
    # report counters in place; delegating avoids a second copy of that loop.
    download_subtitles(videos, report)