def test_cache_invalidation(self):
    """Create the cache item, then invalidate it and show that it is deleted."""
    # Pick a video id to test on (deterministic; could instead be random.choice(self.video_cache.keys()))
    video_id = self.video_cache.keys()[10]
    sys.stdout.write("Testing on video_id = %s\n" % video_id)
    video_path = self.video_cache[video_id][0]['path']

    # Clean the cache for this item
    caching.expire_page(path=video_path, failure_ok=True)

    # Create the cache item, and check it
    self.assertTrue(not caching.has_cache_key(path=video_path), "expect: no cache key after expiring the page")
    caching.regenerate_all_pages_related_to_videos(video_ids=[video_id])
    self.assertTrue(caching.has_cache_key(path=video_path), "expect: cache key exists after regenerating the page")

    # Invalidate the cache item, and check it
    caching.invalidate_all_caches()  # test the convenience function
    self.assertTrue(not caching.has_cache_key(path=video_path), "expect: no cache key after invalidating all caches")
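# For context: has_cache_key / expire_page above wrap Django's page-cache
# machinery. Below is a minimal sketch of what such helpers might look like,
# assuming Django's RequestFactory and get_cache_key APIs -- illustrative
# only, not necessarily the project's actual caching module (names are
# suffixed _sketch to mark them as hypothetical).
from django.core.cache import cache
from django.test.client import RequestFactory
from django.utils.cache import get_cache_key

def has_cache_key_sketch(path):
    # Build a GET request for the path, then ask Django for the page's cache key.
    request = RequestFactory().get(path)
    key = get_cache_key(request)
    return key is not None and cache.get(key) is not None

def expire_page_sketch(path, failure_ok=False):
    # Delete the cached page if present; optionally tolerate a missing key.
    request = RequestFactory().get(path)
    key = get_cache_key(request)
    if key:
        cache.delete(key)
    elif not failure_ok:
        raise ValueError("No cache key found for %s" % path)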
def handle(self, *args, **options):
    self.video = None
    handled_youtube_ids = []  # stored to deal with caching
    failed_youtube_ids = []  # stored to avoid requerying failures.

    set_process_priority.lowest(logging=settings.LOG)

    try:
        while True:  # loop until the method is aborted
            # Grab any video that hasn't been tried yet
            videos = VideoFile.objects \
                .filter(flagged_for_download=True, download_in_progress=False) \
                .exclude(youtube_id__in=failed_youtube_ids)
            video_count = videos.count()
            if video_count == 0:
                self.stdout.write(_("Nothing to download; exiting.") + "\n")
                break

            # Grab a video as OURS to handle; set fields to indicate to others that we're on it!
            # Update the video logging
            video = videos[0]
            self.video = video  # remember the video being handled, for the final stage update below
            video.download_in_progress = True
            video.percent_complete = 0
            video.save()
            self.stdout.write((_("Downloading video '%(youtube_id)s'...") + "\n") % {"youtube_id": video.youtube_id})

            # Update the progress logging
            self.set_stages(num_stages=video_count + len(handled_youtube_ids) + len(failed_youtube_ids) + int(options["auto_cache"]))
            if not self.started():
                self.start(stage_name=video.youtube_id)

            # Initiate the download process
            try:
                ensure_dir(settings.CONTENT_ROOT)

                progress_callback = partial(self.download_progress_callback, video)

                try:
                    # Download via urllib
                    download_video(video.youtube_id, callback=progress_callback)

                except URLNotFound:
                    # Video was not found on the Amazon cloud service,
                    # either due to a KA mistake, or because it's a dubbed video.
                    #
                    # We can use youtube-dl to get that video!
                    logging.debug(_("Retrieving youtube video %(youtube_id)s via youtube-dl") % {"youtube_id": video.youtube_id})

                    def youtube_dl_cb(stats, progress_callback, *args, **kwargs):
                        # Map youtube-dl's status dict onto a simple percentage.
                        if stats['status'] == "finished":
                            percent = 100.
                        elif stats['status'] == "downloading":
                            percent = 100. * stats['downloaded_bytes'] / stats['total_bytes']
                        else:
                            percent = 0.
                        progress_callback(percent=percent)

                    scrape_video(video.youtube_id, quiet=not settings.DEBUG, callback=partial(youtube_dl_cb, progress_callback=progress_callback))

                # If we got here, we downloaded ... somehow :)
                handled_youtube_ids.append(video.youtube_id)
                self.stdout.write(_("Download is complete!") + "\n")

            except DownloadCancelled:
                # Cancellation event: reset the video's state.
                video.percent_complete = 0
                video.flagged_for_download = False
                video.download_in_progress = False
                video.save()
                failed_youtube_ids.append(video.youtube_id)

            except Exception as e:
                # On error, report the error, mark the video as not downloaded,
                # and allow the loop to try other videos.
                msg = _("Error in downloading %(youtube_id)s: %(error_msg)s") % {"youtube_id": video.youtube_id, "error_msg": unicode(e)}
                self.stderr.write("%s\n" % msg)

                # If it was a connection error, we should retry the video later.
                if isinstance(e, DownloadError):
                    connection_error = "[Errno 8]" in e.message
                elif isinstance(e, IOError) and hasattr(e, "errno"):
                    connection_error = e.errno == 8
                else:
                    connection_error = False

                video.download_in_progress = False
                video.flagged_for_download = connection_error  # Any error other than a connection error is fatal.
                video.save()

                # Rather than getting stuck on one video, continue to the next video.
                self.update_stage(stage_status="error", notes=_("%(error_msg)s; continuing to next video.") % {"error_msg": msg})
                failed_youtube_ids.append(video.youtube_id)
                continue

        # This can take a long time, without any further update, so ... best to avoid.
        if options["auto_cache"] and caching.caching_is_enabled() and handled_youtube_ids:
            self.update_stage(stage_name=self.video.youtube_id, stage_percent=0, notes=_("Generating all pages related to videos."))
            caching.regenerate_all_pages_related_to_videos(video_ids=list(set([i18n.get_video_id(yid) or yid for yid in handled_youtube_ids])))

        # Update
        self.complete(notes=_("Downloaded %(num_handled_videos)s of %(num_total_videos)s videos successfully.") % {
            "num_handled_videos": len(handled_youtube_ids),
            "num_total_videos": len(handled_youtube_ids) + len(failed_youtube_ids),
        })

    except Exception as e:
        self.cancel(stage_status="error", notes=_("Error: %(error_msg)s") % {"error_msg": e})
        raise
def handle(self, *args, **options):
    if settings.CENTRAL_SERVER:
        raise CommandError("videoscan should be run on the distributed server only.")

    caching_enabled = (settings.CACHE_TIME != 0)
    touched_video_ids = []

    # Filesystem
    files = glob.glob(os.path.join(settings.CONTENT_ROOT, "*.mp4"))
    youtube_ids_in_filesystem = set([os.path.splitext(os.path.basename(f))[0] for f in files])

    # Database
    videos_marked_at_all = set([video.youtube_id for video in VideoFile.objects.all()])

    def delete_objects_for_incomplete_videos():
        # Delete VideoFile objects that are not marked as in progress, but are
        # neither 0% nor 100% done; they're broken.
        video_files_to_delete = VideoFile.objects.filter(download_in_progress=False, percent_complete__gt=0, percent_complete__lt=100)
        deleted_video_ids = [i18n.get_video_id(video_file.youtube_id) for video_file in video_files_to_delete]
        video_files_to_delete.delete()
        if deleted_video_ids:
            self.stdout.write("Deleted %d VideoFile models (to mark them as not downloaded, since they were in a bad state)\n" % len(deleted_video_ids))
        return deleted_video_ids

    touched_video_ids += delete_objects_for_incomplete_videos()

    def add_missing_objects_to_db(youtube_ids_in_filesystem, videos_marked_at_all):
        # Files that exist but are not in the DB should be assumed to be good videos
        # that just need to be added to the DB. Add them to the DB in this way,
        # so that these files also trigger the update code below (and trigger cache invalidation).
        youtube_ids_needing_model_creation = list(youtube_ids_in_filesystem - videos_marked_at_all)
        new_video_files = []
        if youtube_ids_needing_model_creation:
            for lang_code, youtube_ids in divide_videos_by_language(youtube_ids_needing_model_creation).iteritems():
                # OK to do bulk_create (which skips save signals); cache invalidation is triggered explicitly below.
                lang_video_files = [VideoFile(youtube_id=id, percent_complete=100, download_in_progress=False, language=lang_code) for id in youtube_ids]
                VideoFile.objects.bulk_create(lang_video_files)
                new_video_files += lang_video_files
                caching.invalidate_all_caches()  # Do this within the loop, to update users ASAP

            self.stdout.write("Created %d VideoFile models (and marked them as complete, since the files exist)\n" % len(new_video_files))

        return [i18n.get_video_id(video_file.youtube_id) for video_file in new_video_files]

    touched_video_ids += add_missing_objects_to_db(youtube_ids_in_filesystem, videos_marked_at_all)

    def update_objects_to_be_complete(youtube_ids_in_filesystem):
        # Files that exist and are in the DB, but have percent_complete=0 and download_in_progress=False
        updated_video_ids = []
        for chunk in break_into_chunks(youtube_ids_in_filesystem):
            video_files_needing_model_update = VideoFile.objects.filter(percent_complete=0, download_in_progress=False, youtube_id__in=chunk)
            # Collect the ids before updating; afterwards, the percent_complete=0
            # filter would re-evaluate to an empty queryset.
            updated_video_ids += [i18n.get_video_id(video_file.youtube_id) for video_file in video_files_needing_model_update]
            video_files_needing_model_update.update(percent_complete=100, flagged_for_download=False)

        if updated_video_ids:
            caching.invalidate_all_caches()
            self.stdout.write("Updated %d VideoFile models (to mark them as complete, since the files exist)\n" % len(updated_video_ids))
        return updated_video_ids

    touched_video_ids += update_objects_to_be_complete(youtube_ids_in_filesystem)

    def delete_objects_for_missing_videos(youtube_ids_in_filesystem, videos_marked_at_all):
        # VideoFile objects that say they're available, but whose files don't actually exist.
        deleted_video_ids = []
        videos_flagged_for_download = set([video.youtube_id for video in VideoFile.objects.filter(flagged_for_download=True)])
        videos_needing_model_deletion_chunked = break_into_chunks(videos_marked_at_all - youtube_ids_in_filesystem - videos_flagged_for_download)
        for chunk in videos_needing_model_deletion_chunked:
            video_files_needing_model_deletion = VideoFile.objects.filter(youtube_id__in=chunk)
            # Collect the ids before deleting; afterwards, the queryset would
            # re-evaluate to empty.
            deleted_video_ids += [i18n.get_video_id(video_file.youtube_id) for video_file in video_files_needing_model_deletion]
            video_files_needing_model_deletion.delete()

        if deleted_video_ids:
            self.stdout.write("Deleted %d VideoFile models (because the videos didn't exist in the filesystem)\n" % len(deleted_video_ids))
        return deleted_video_ids

    touched_video_ids += delete_objects_for_missing_videos(youtube_ids_in_filesystem, videos_marked_at_all)

    if options["auto_cache"] and caching_enabled and touched_video_ids:
        caching.regenerate_all_pages_related_to_videos(video_ids=list(set(touched_video_ids)))
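# break_into_chunks (used above) keeps the youtube_id__in filters under the
# database's bound-parameter limit (SQLite defaults to 999 variables per
# query). A minimal sketch of such a helper, with the chunk size as an
# assumption:
def break_into_chunks_sketch(big_iterator, chunk_size=500):
    # Return successive fixed-size slices of the input.
    all_items = list(big_iterator)
    return [all_items[i:i + chunk_size] for i in xrange(0, len(all_items), chunk_size)]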