def handle(self, *args, **options):
    if settings.CENTRAL_SERVER:
        raise CommandError("syncmodels cannot be run on the central server.")

    # Parse input parameters
    kwargs = {"host": args[0]} if len(args) >= 1 else {}
    max_retries = int(args[1]) if len(args) >= 2 else 5  # CLI args arrive as strings

    # Don't block users from web access due to syncing
    set_process_priority.lowest(logging=settings.LOG)

    # Retry purgatory
    self.stdout_writeln(_("Checking purgatory for unsaved models") + "...")
    call_command("retrypurgatory")

    client = SyncClient(**kwargs)
    connection_status = client.test_connection()
    if connection_status != "success":
        self.stderr_writeln(_("KA Lite host is currently unreachable") + " (%s): %s" % (connection_status, client.url))
        return

    self.stdout_writeln(_("Initiating SyncSession") + "...")
    result = client.start_session()
    if result != "success":
        self.stderr_writeln(_("Unable to initiate session") + ": %s" % result.content)
        return

    self.stdout_writeln(_("Syncing models") + "...")

    failure_tries = 0
    while True:
        results = client.sync_models()
        upload_results = results["upload_results"]
        download_results = results["download_results"]

        # Display counts for this block of models being transferred
        self.stdout_writeln("\t%-15s: %d (%d failed, %d error(s))" % (
            _("Uploaded"),
            upload_results["saved_model_count"],
            upload_results["unsaved_model_count"],
            "error" in upload_results))
        self.stdout_writeln("\t%-15s: %d (%d failed, %d error(s))" % (
            _("Downloaded"),
            download_results["saved_model_count"],
            download_results["unsaved_model_count"],
            "error" in download_results))

        # Count the number of successes and failures
        success_count = upload_results["saved_model_count"] + download_results["saved_model_count"]
        fail_count = upload_results["unsaved_model_count"] + download_results["unsaved_model_count"]
        error_count = ("error" in upload_results) + ("error" in download_results) + ("exceptions" in upload_results)

        # Report any errors
        if error_count > 0:
            if "error" in upload_results:
                self.stderr_writeln("%s: %s" % (_("Upload error"), upload_results["error"]))
            if "error" in download_results:
                self.stderr_writeln("%s: %s" % (_("Download error"), download_results["error"]))
            if "exceptions" in upload_results:
                self.stderr_writeln("%s: %s" % (_("Upload exceptions"), upload_results["exceptions"][:200]))

        # Stop when nothing is being transferred anymore
        if success_count == 0 and (fail_count == 0 or failure_tries >= max_retries):
            break
        failure_tries += (fail_count > 0 and success_count == 0)

        # Allow the user to throttle the syncing by inserting a wait, so that users
        # aren't overwhelmed by the computational need for signing during sync
        if settings.SYNCING_THROTTLE_WAIT_TIME is not None:
            time.sleep(settings.SYNCING_THROTTLE_WAIT_TIME)

    # Report summaries
    self.stdout_writeln("%s... (%s: %d, %s: %d, %s: %d)" % (
        _("Closing session"),
        _("Total uploaded"), client.session.models_uploaded,
        _("Total downloaded"), client.session.models_downloaded,
        _("Total errors"), client.session.errors))

    # Report any exceptions
    if client.session.errors:
        self.stderr_writeln("Completed with %d errors." % client.session.errors)
    if failure_tries >= max_retries:
        self.stderr_writeln("%s (%d)." % (_("Failed to upload all models (stopped after failed attempts)"), failure_tries))

    self.stdout_writeln(_("Checking purgatory once more, to try saving any unsaved models") + "...")
    call_command("retrypurgatory")

    client.close_session()
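# Usage sketch for the command above (hedged: assumes this handle() lives on a
# Django management command named "syncmodels", per the CommandError message, and
# that the optional positional args are the sync host and the max retry count):
#
#   python manage.py syncmodels                                # sync against the default host
#   python manage.py syncmodels http://192.168.0.5:8008/ 10    # hypothetical host, up to 10 retries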
def handle(self, *args, **options):
    self.video = None

    handled_youtube_ids = []  # stored to deal with caching
    failed_youtube_ids = []   # stored to avoid requerying failures.

    set_process_priority.lowest(logging=settings.LOG)

    try:
        while True:  # loop until the method is aborted

            # Grab any video that hasn't been tried yet
            videos = VideoFile.objects \
                .filter(flagged_for_download=True, download_in_progress=False) \
                .exclude(youtube_id__in=failed_youtube_ids)
            video_count = videos.count()
            if video_count == 0:
                self.stdout.write(_("Nothing to download; exiting.") + "\n")
                break

            # Grab a video as OURS to handle, set fields to indicate to others that we're on it!
            # Update the video logging
            video = videos[0]
            self.video = video  # keep a reference for the caching stage after the loop
            video.download_in_progress = True
            video.percent_complete = 0
            video.save()
            self.stdout.write((_("Downloading video '%(youtube_id)s'...") + "\n") % {"youtube_id": video.youtube_id})

            # Update the progress logging
            self.set_stages(num_stages=video_count + len(handled_youtube_ids) + len(failed_youtube_ids) + int(options["auto_cache"]))
            if not self.started():
                self.start(stage_name=video.youtube_id)

            # Initiate the download process
            try:
                ensure_dir(settings.CONTENT_ROOT)

                progress_callback = partial(self.download_progress_callback, video)
                try:
                    # Download via urllib
                    download_video(video.youtube_id, callback=progress_callback)

                except URLNotFound:
                    # Video was not found on amazon cloud service,
                    #   either due to a KA mistake, or due to the fact
                    #   that it's a dubbed video.
                    #
                    # We can use youtube-dl to get that video!!
                    logging.debug(_("Retrieving youtube video %(youtube_id)s via youtube-dl") % {"youtube_id": video.youtube_id})

                    def youtube_dl_cb(stats, progress_callback, *args, **kwargs):
                        if stats['status'] == "finished":
                            percent = 100.
                        elif stats['status'] == "downloading":
                            percent = 100. * stats['downloaded_bytes'] / stats['total_bytes']
                        else:
                            percent = 0.
                        progress_callback(percent=percent)

                    scrape_video(video.youtube_id, quiet=not settings.DEBUG, callback=partial(youtube_dl_cb, progress_callback=progress_callback))

                # If we got here, we downloaded ... somehow :)
                handled_youtube_ids.append(video.youtube_id)
                self.stdout.write(_("Download is complete!") + "\n")

                # caching.invalidate_all_caches()  # Unnecessary; we have a database listener for this.

            except DownloadCancelled:
                # Cancellation event
                video.percent_complete = 0
                video.flagged_for_download = False
                video.download_in_progress = False
                video.save()
                failed_youtube_ids.append(video.youtube_id)

            except Exception as e:
                # On error, report the error, mark the video as not downloaded,
                #   and allow the loop to try other videos.
                msg = _("Error in downloading %(youtube_id)s: %(error_msg)s") % {"youtube_id": video.youtube_id, "error_msg": unicode(e)}
                self.stderr.write("%s\n" % msg)

                # If a connection error, we should retry.
                if isinstance(e, DownloadError):
                    connection_error = "[Errno 8]" in e.message
                elif isinstance(e, IOError) and hasattr(e, "strerror"):
                    connection_error = e.strerror[0] == 8
                else:
                    connection_error = False

                video.download_in_progress = False
                video.flagged_for_download = connection_error  # Any error other than a connection error is fatal.
                video.save()

                # Rather than getting stuck on one video, continue to the next video.
                self.update_stage(stage_status="error", notes=_("%(error_msg)s; continuing to next video.") % {"error_msg": msg})
                failed_youtube_ids.append(video.youtube_id)
                continue

        # This can take a long time, without any further update, so ... best to avoid.
        if options["auto_cache"] and caching.caching_is_enabled() and handled_youtube_ids:
            self.update_stage(stage_name=self.video.youtube_id, stage_percent=0, notes=_("Generating all pages related to videos."))
            caching.regenerate_all_pages_related_to_videos(video_ids=list(set([i18n.get_video_id(yid) or yid for yid in handled_youtube_ids])))

        # Update
        self.complete(notes=_("Downloaded %(num_handled_videos)s of %(num_total_videos)s videos successfully.") % {
            "num_handled_videos": len(handled_youtube_ids),
            "num_total_videos": len(handled_youtube_ids) + len(failed_youtube_ids),
        })

    except Exception as e:
        self.cancel(stage_status="error", notes=_("Error: %(error_msg)s") % {"error_msg": e})
        raise
def handle(self, *args, **options):
    self.setup(options)
    self.video = {}

    handled_youtube_ids = []  # stored to deal with caching
    failed_youtube_ids = []   # stored to avoid requerying failures.

    set_process_priority.lowest(logging=settings.LOG)

    try:
        while True:  # loop until the method is aborted

            # Grab any video that hasn't been tried yet
            video_queue = VideoQueue()
            video_count = video_queue.count()
            if video_count == 0:
                self.stdout.write(_("Nothing to download; exiting.") + "\n")
                break

            # Grab a video as OURS to handle, set fields to indicate to others that we're on it!
            # Update the video logging
            video = video_queue.next()
            video["download_in_progress"] = True
            video["percent_complete"] = 0
            self.stdout.write((_("Downloading video '%(youtube_id)s'...") + "\n") % {"youtube_id": video.get("youtube_id")})

            # Update the progress logging
            self.set_stages(num_stages=video_count + len(handled_youtube_ids) + len(failed_youtube_ids) + int(options["auto_cache"]))
            if not self.started():
                self.start(stage_name=video.get("youtube_id"))

            # Initiate the download process
            try:
                progress_callback = partial(self.download_progress_callback, video)

                # Don't try to download a file that already exists in the content dir - just say it was successful
                # and call it a day!
                if not os.path.exists(os.path.join(settings.CONTENT_ROOT, "{id}.mp4".format(id=video.get("youtube_id")))):
                    try:
                        # Download via urllib
                        download_video(video.get("youtube_id"), callback=progress_callback)
                    except URLNotFound:
                        # Video was not found on amazon cloud service,
                        #   either due to a KA mistake, or due to the fact
                        #   that it's a dubbed video.
                        #
                        # We can use youtube-dl to get that video!!
                        logging.debug(_("Retrieving youtube video %(youtube_id)s via youtube-dl") % {"youtube_id": video.get("youtube_id")})

                        def youtube_dl_cb(stats, progress_callback, *args, **kwargs):
                            if stats['status'] == "finished":
                                percent = 100.
                            elif stats['status'] == "downloading":
                                percent = 100. * stats['downloaded_bytes'] / stats['total_bytes']
                            else:
                                percent = 0.
                            progress_callback(percent=percent)

                        scrape_video(video.get("youtube_id"), quiet=not settings.DEBUG, callback=partial(youtube_dl_cb, progress_callback=progress_callback))
                    except IOError as e:
                        logging.exception(e)
                        failed_youtube_ids.append(video.get("youtube_id"))
                        video_queue.remove_file(video.get("youtube_id"))
                        time.sleep(10)
                        continue

                # If we got here, we downloaded ... somehow :)
                handled_youtube_ids.append(video.get("youtube_id"))
                video_queue.remove_file(video.get("youtube_id"))
                self.stdout.write(_("Download is complete!") + "\n")

                annotate_content_models_by_youtube_id(youtube_ids=[video.get("youtube_id")], language=video.get("language"))

            except DownloadCancelled:
                # Cancellation event
                video_queue.clear()
                failed_youtube_ids.append(video.get("youtube_id"))

            except Exception as e:
                # On error, report the error, mark the video as not downloaded,
                #   and allow the loop to try other videos.
                msg = _("Error in downloading %(youtube_id)s: %(error_msg)s") % {"youtube_id": video.get("youtube_id"), "error_msg": unicode(e)}
                self.stderr.write("%s\n" % msg)

                # If a connection error, we should retry.
                if isinstance(e, DownloadError):
                    connection_error = "[Errno 8]" in e.args[0]
                elif isinstance(e, IOError) and hasattr(e, "strerror"):
                    connection_error = e.strerror[0] == 8
                else:
                    connection_error = False

                # Rather than getting stuck on one video, continue to the next video.
                self.update_stage(stage_status="error", notes=_("%(error_msg)s; continuing to next video.") % {"error_msg": msg})
                failed_youtube_ids.append(video.get("youtube_id"))
                video_queue.remove_file(video.get("youtube_id"))
                continue

        # Update
        self.complete(notes=_("Downloaded %(num_handled_videos)s of %(num_total_videos)s videos successfully.") % {
            "num_handled_videos": len(handled_youtube_ids),
            "num_total_videos": len(handled_youtube_ids) + len(failed_youtube_ids),
        })

    except Exception as e:
        self.cancel(stage_status="error", notes=_("Error: %(error_msg)s") % {"error_msg": e})
        raise
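# Sketch of the queue interface the loop above assumes (a stub for illustration;
# the real VideoQueue is defined elsewhere in the codebase). Only the four calls
# actually used appear here:
#
#   class VideoQueue(object):
#       def count(self):                    # number of videos still queued
#           ...
#       def next(self):                     # dict with at least "youtube_id", "title", "language"
#           ...
#       def remove_file(self, youtube_id):  # drop one entry, whether it succeeded or failed
#           ...
#       def clear(self):                    # empty the whole queue on cancellation
#           ...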
def handle(self, *args, **options):
    self.setup(options)
    self.video = {}

    handled_youtube_ids = []  # stored to deal with caching
    failed_youtube_ids = []   # stored to avoid requerying failures.

    set_process_priority.lowest(logging=logger)

    try:
        while True:  # loop until the method is aborted

            # Grab any video that hasn't been tried yet
            video_queue = VideoQueue()
            video_count = video_queue.count()
            if video_count == 0:
                self.stdout.write(_("Nothing to download; exiting.") + "\n")
                break

            # Grab a video as OURS to handle, set fields to indicate to others that we're on it!
            # Update the video logging
            video = video_queue.next()
            video["download_in_progress"] = True
            video["percent_complete"] = 0
            self.stdout.write((_("Downloading video '%(youtube_id)s'...") + "\n") % {"youtube_id": video.get("youtube_id")})

            # Update the progress logging
            self.set_stages(num_stages=video_count + len(handled_youtube_ids) + len(failed_youtube_ids) + int(options["auto_cache"]))
            if not self.started():
                self.start(stage_name=video.get("youtube_id"))

            # Initiate the download process
            try:
                progress_callback = partial(self.download_progress_callback, video)

                # Don't try to download a file that already exists in the content dir - just say it was successful
                # and call it a day!
                if not os.path.exists(os.path.join(settings.CONTENT_ROOT, "{id}.mp4".format(id=video.get("youtube_id")))):
                    retries = 0
                    while True:
                        try:
                            download_video(video.get("youtube_id"), callback=progress_callback)
                            break
                        except (socket.timeout, ConnectionError):
                            retries += 1
                            msg = _(
                                "Pausing download for '{title}', failed {failcnt} times, sleeping for 30s, retry number {retries}"
                            ).format(
                                title=video.get("title"),
                                failcnt=DOWNLOAD_MAX_RETRIES,
                                retries=retries,
                            )
                            try:
                                self.update_stage(
                                    stage_name=video.get("youtube_id"),
                                    stage_percent=0.,
                                    notes=msg
                                )
                            except AssertionError:
                                # Raised by update_stage when the video
                                # download job has ended
                                raise DownloadCancelled()
                            logger.info(msg)
                            time.sleep(30)
                            continue

                # If we got here, we downloaded ... somehow :)
                handled_youtube_ids.append(video.get("youtube_id"))

                # Remove the item from the queue
                video_queue.remove_file(video.get("youtube_id"))

                self.stdout.write(_("Download is complete!") + "\n")

                annotate_content_models_by_youtube_id(youtube_ids=[video.get("youtube_id")], language=video.get("language"))

            except DownloadCancelled:
                video_queue.clear()
                failed_youtube_ids.append(video.get("youtube_id"))
                break

            except (HTTPError, Exception) as e:
                # Rather than getting stuck on one video,
                # completely remove this item from the queue
                failed_youtube_ids.append(video.get("youtube_id"))
                video_queue.remove_file(video.get("youtube_id"))
                logger.exception(e)

                if getattr(e, "response", None):
                    reason = _(
                        "Got non-OK HTTP status: {status}"
                    ).format(
                        status=e.response.status_code
                    )
                else:
                    reason = _(
                        "Unhandled request exception: "
                        "{exception}"
                    ).format(
                        exception=str(e),
                    )

                msg = _(
                    "Skipping '{title}', reason: {reason}"
                ).format(
                    title=video.get('title'),
                    reason=reason,
                )

                # Inform the user of this problem
                self.update_stage(
                    stage_name=video.get("youtube_id"),
                    stage_percent=0.,
                    notes=msg
                )
                logger.info(msg)
                continue

        # Update
        self.complete(notes=_("Downloaded %(num_handled_videos)s of %(num_total_videos)s videos successfully.") % {
            "num_handled_videos": len(handled_youtube_ids),
            "num_total_videos": len(handled_youtube_ids) + len(failed_youtube_ids),
        })

    except Exception as e:
        logger.exception(e)
        self.cancel(stage_status="error", notes=_("Error: %(error_msg)s") % {"error_msg": e})
        raise
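# Imports assumed by the retry handling above (a sketch; exact import paths depend
# on the surrounding module). socket.timeout is stdlib; ConnectionError and
# HTTPError most plausibly come from requests, given that e.response.status_code
# is read in the exception handler:
#
#   import os
#   import socket
#   import time
#   from requests.exceptions import ConnectionError, HTTPError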