def zip_language_packs(lang_codes=None):
    """Zip up and expose all language packs.

    lang_codes: iterable of language codes to package; defaults to every
    directory under LOCALE_ROOT.
    """
    # Fixed: was a bare `listdir(...)`; `os` is in scope (os.path is used
    # below), a bare listdir requires a separate `from os import listdir`.
    lang_codes = lang_codes or os.listdir(LOCALE_ROOT)
    logging.info("Zipping up %d language pack(s)" % len(lang_codes))
    ensure_dir(settings.LANGUAGE_PACK_ROOT)

    for lang in lang_codes:
        lang_locale_path = os.path.join(LOCALE_ROOT, lang)

        # Skip languages we can't actually package; previously a zip was
        # still created for these, exposing an empty language pack.
        if not os.path.exists(lang_locale_path):
            logging.warn("Unexpectedly skipping missing directory: %s" % lang)
            continue
        elif not os.path.isdir(lang_locale_path):
            logging.error("Skipping language where a file exists: %s" % lang)
            continue

        # Create a zipfile for this language
        zip_path = os.path.join(settings.LANGUAGE_PACK_ROOT, version.VERSION)
        ensure_dir(zip_path)
        z = zipfile.ZipFile(os.path.join(zip_path, "%s.zip" % convert_language_code_format(lang)), 'w')

        # Get every single file in the directory and zip it up
        for metadata_file in glob.glob('%s/*.json' % lang_locale_path):
            z.write(os.path.join(lang_locale_path, metadata_file), arcname=os.path.basename(metadata_file))
        for mo_file in glob.glob('%s/LC_MESSAGES/*.mo' % lang_locale_path):
            z.write(os.path.join(lang_locale_path, mo_file), arcname=os.path.join("LC_MESSAGES", os.path.basename(mo_file)))
        for srt_file in glob.glob('%s/subtitles/*.srt' % lang_locale_path):
            z.write(os.path.join(lang_locale_path, srt_file), arcname=os.path.join("subtitles", os.path.basename(srt_file)))
        z.close()
    logging.info("Done.")
def zip_language_packs(lang_codes=None):
    """Zip up and expose all language packs; converts all codes into ietf.

    lang_codes: iterable of language codes to package; defaults to every
    directory under LOCALE_ROOT.
    """
    lang_codes = lang_codes or os.listdir(LOCALE_ROOT)
    lang_codes = [lcode_to_ietf(lc) for lc in lang_codes]
    logging.info("Zipping up %d language pack(s)" % len(lang_codes))

    for lang_code_ietf in lang_codes:
        lang_code_django = lcode_to_django_dir(lang_code_ietf)
        lang_locale_path = os.path.join(LOCALE_ROOT, lang_code_django)

        # Skip languages we can't actually package; previously a zip was
        # still created for these, exposing an empty language pack.
        if not os.path.exists(lang_locale_path):
            logging.warn("Unexpectedly skipping missing directory: %s" % lang_code_django)
            continue
        elif not os.path.isdir(lang_locale_path):
            logging.error("Skipping language where a file exists where a directory was expected: %s" % lang_code_django)
            continue

        # Create a zipfile for this language
        zip_filepath = get_language_pack_filepath(lang_code_ietf)
        ensure_dir(os.path.dirname(zip_filepath))
        logging.info("Creating zip file in %s" % zip_filepath)
        z = zipfile.ZipFile(zip_filepath, 'w', zipfile.ZIP_DEFLATED)

        # Get every single file in the directory and zip it up
        for metadata_file in glob.glob('%s/*.json' % lang_locale_path):
            z.write(os.path.join(lang_locale_path, metadata_file), arcname=os.path.basename(metadata_file))

        srt_dirpath = get_srt_path(lang_code_django)
        for srt_file in glob.glob(os.path.join(srt_dirpath, "*.srt")):
            z.write(srt_file, arcname=os.path.join("subtitles", os.path.basename(srt_file)))
        z.close()
    logging.info("Done.")
def handle(self, *args, **options):
    """Build the dubbed-video mapping and save it as JSON.

    Uses a local CSV cache when it is at most 14 days old; otherwise
    downloads fresh data and refreshes the cache (best-effort).
    """
    # Get the CSV data, either from a recent cache_file
    # or from the internet
    cache_dir = settings.MEDIA_ROOT
    cache_file = os.path.join(cache_dir, "dubbed_videos.csv")

    if os.path.exists(cache_file) and datediff(datetime.datetime.now(), datetime.datetime.fromtimestamp(os.path.getctime(cache_file)), units="days") <= 14.0:
        # Use cached data to generate the video map
        with open(cache_file, "r") as fp:  # fixed: file handle was left unclosed
            csv_data = fp.read()
        (video_map, _) = generate_dubbed_video_mappings(csv_data=csv_data)
    else:
        # Download fresh data, then cache it locally (best-effort).
        (video_map, csv_data) = generate_dubbed_video_mappings()
        try:
            ensure_dir(cache_dir)
            with open(cache_file, "w") as fp:
                fp.write(csv_data)
        except Exception as e:
            logging.error("Failed to make a local cache of the CSV data: %s" % e)

    # Now we've built the map. Save it.
    out_file = DUBBED_VIDEOS_MAPPING_FILE
    ensure_dir(os.path.dirname(out_file))
    logging.info("Saving data to %s" % out_file)
    with open(out_file, "w") as fp:
        json.dump(video_map, fp)

    logging.info("Done.")
def handle(self, *args, **options):
    """Generate and persist the dubbed-video mapping.

    A CSV cache under MEDIA_ROOT is reused when it is at most two weeks
    old; otherwise the data is fetched remotely and re-cached.
    """
    cache_dir = settings.MEDIA_ROOT
    cache_file = os.path.join(cache_dir, "dubbed_videos.csv")

    use_cache = False
    if os.path.exists(cache_file):
        cache_age_days = datediff(
            datetime.datetime.now(),
            datetime.datetime.fromtimestamp(os.path.getctime(cache_file)),
            units="days")
        use_cache = cache_age_days <= 14.0

    if use_cache:
        # Build the map from the locally cached CSV.
        csv_data = open(cache_file, "r").read()
        (video_map, _) = generate_dubbed_video_mappings(csv_data=csv_data)
    else:
        # Fetch remotely, then try to refresh the local cache (best-effort).
        (video_map, csv_data) = generate_dubbed_video_mappings()
        try:
            ensure_dir(cache_dir)
            with open(cache_file, "w") as fp:
                fp.write(csv_data)
        except Exception as e:
            logging.error("Failed to make a local cache of the CSV data: %s" % e)

    # Persist the finished map as JSON.
    dest = DUBBED_VIDEOS_MAPPING_FILE
    ensure_dir(os.path.dirname(dest))
    logging.info("Saving data to %s" % dest)
    with open(dest, "w") as fp:
        json.dump(video_map, fp)

    logging.info("Done.")
def move_exercises(lang_code):
    """Move downloaded exercise HTML files from the language pack's locale
    directory into the localized-exercise directory, then remove the
    (now empty) download directory."""
    lang_pack_location = os.path.join(LOCALE_ROOT, lang_code)
    src_exercise_dir = os.path.join(lang_pack_location, "exercises")
    dest_exercise_dir = get_localized_exercise_dirpath(lang_code, is_central_server=False)

    if not os.path.exists(src_exercise_dir):
        logging.warn("Could not find downloaded exercises; skipping: %s" % src_exercise_dir)
    else:
        # Move over one at a time, to combine with any other resources that were there before.
        ensure_dir(dest_exercise_dir)
        all_exercise_files = glob.glob(os.path.join(src_exercise_dir, "*.html"))
        logging.info("Moving %d downloaded exercises to %s" % (len(all_exercise_files), dest_exercise_dir))

        for exercise_file in all_exercise_files:
            shutil.move(exercise_file, os.path.join(dest_exercise_dir, os.path.basename(exercise_file)))

        logging.debug("Removing empty directory")
        try:
            shutil.rmtree(src_exercise_dir)
        except Exception as e:
            # Fixed: message previously (wrongly) said "dubbed video directory".
            logging.error("Error removing exercise directory (%s): %s" % (src_exercise_dir, e))
def move_srts(lang_code):
    """
    Srts live in the locale directory, but that's not exposed at any URL. So instead,
    we have to move the srts out to /static/subtitles/[lang_code]/
    """
    # Fixed: removed unused locals (lang_code_ietf, subtitles_static_dir).
    lang_code_django = lcode_to_django_dir(lang_code)

    src_dir = os.path.join(LOCALE_ROOT, lang_code_django, "subtitles")
    dest_dir = get_srt_path(lang_code_django)
    ensure_dir(dest_dir)

    lang_subtitles = glob.glob(os.path.join(src_dir, "*.srt"))
    logging.info("Moving %d subtitles from %s to %s" % (len(lang_subtitles), src_dir, dest_dir))

    for fil in lang_subtitles:
        srt_dest_path = os.path.join(dest_dir, os.path.basename(fil))
        if os.path.exists(srt_dest_path):
            os.remove(srt_dest_path)  # we're going to replace any srt with a newer version
        shutil.move(fil, srt_dest_path)

    if not os.path.exists(src_dir):
        logging.info("No subtitles for language pack %s" % lang_code)
    elif os.listdir(src_dir):
        logging.warn("%s is not empty; will not remove. Please check that all subtitles were moved." % src_dir)
    else:
        logging.info("Removing empty source directory (%s)." % src_dir)
        shutil.rmtree(src_dir)
def generate_zipped_srts(lang_codes_to_update, download_path):
    """Create one <lang>_subtitles.zip per language containing its .srt files.

    lang_codes_to_update: languages to (re)package; falsy means every
    directory under download_path.
    """
    # Create media directory if it doesn't yet exist
    ensure_dir(settings.MEDIA_ROOT)
    # Fixed: was string concatenation (MEDIA_ROOT + "subtitles/"), which
    # breaks if MEDIA_ROOT lacks a trailing slash.
    zip_path = os.path.join(settings.MEDIA_ROOT, "subtitles")
    ensure_dir(zip_path)

    lang_codes_to_update = lang_codes_to_update or os.listdir(download_path)
    for lang_code in lang_codes_to_update:
        srt_dir = os.path.join(download_path, lang_code, "subtitles")
        zip_file = os.path.join(zip_path, "%s_subtitles.zip" % lang_code)

        # Remove any old version (as we may not re-create)
        if os.path.exists(zip_file):
            os.remove(zip_file)

        if not os.path.exists(srt_dir):
            logging.warn("No srt directory for %s; skipping." % lang_code)
            continue

        srts = glob.glob(os.path.join(srt_dir, "*.srt"))
        if len(srts) == 0:
            logging.warn("No srts for %s; skipping." % lang_code)
            continue

        logging.info("Zipping up a new pack for language code: %s" % lang_code)
        zf = zipfile.ZipFile(zip_file, 'w')
        for f in srts:
            zf.write(f, arcname=os.path.basename(f))
        zf.close()
def move_srts(lang_code):
    """
    Srts live in the locale directory, but that's not exposed at any URL. So instead,
    we have to move the srts out to /static/subtitles/[lang_code]/
    """
    lang_code_ietf = lcode_to_ietf(lang_code)
    lang_code_django = lcode_to_django_dir(lang_code)

    subtitles_static_dir = os.path.join(settings.STATIC_ROOT, "subtitles")
    src_dir = os.path.join(LOCALE_ROOT, lang_code_django, "subtitles")
    dest_dir = get_srt_path(lang_code_django)
    ensure_dir(dest_dir)

    found_srts = glob.glob(os.path.join(src_dir, "*.srt"))
    logging.info("Moving %d subtitles from %s to %s" % (len(found_srts), src_dir, dest_dir))

    for srt_path in found_srts:
        target = os.path.join(dest_dir, os.path.basename(srt_path))
        # Replace any existing srt with the newer version.
        if os.path.exists(target):
            os.remove(target)
        shutil.move(srt_path, target)

    if not os.path.exists(src_dir):
        logging.info("No subtitles for language pack %s" % lang_code)
    elif os.listdir(src_dir):
        # Something unexpected remained behind; leave it for inspection.
        logging.warn("%s is not empty; will not remove. Please check that all subtitles were moved." % src_dir)
    else:
        logging.info("Removing empty source directory (%s)." % src_dir)
        shutil.rmtree(src_dir)
def handle(self, *args, **options): options['platform'] = options['platform'].lower() # normalize if options['platform'] not in ["all", "linux", "macos", "darwin", "windows"]: raise CommandError("Unrecognized platform: %s; will include ALL files." % options['platform']) # Step 1: recursively add all static files kalite_base = os.path.realpath(settings.PROJECT_PATH + "/../") files_dict = recursively_add_files(dirpath=kalite_base, **options) # Step 2: Add a local_settings.py file. # For distributed servers, this is a copy of the local local_settings.py, # with a few properties (specified as command-line options) overridden ls_file = create_local_settings_file(location=os.path.realpath(kalite_base+"/kalite/local_settings.py"), server_type=options['server_type'], locale=options['locale']) files_dict[ls_file] = { "dest_path": "kalite/local_settings.py" } # Step 3: select output file. if options['file']=="__default__": options['file'] = create_default_archive_filename(options) # Step 4: package into a zip file ensure_dir(os.path.realpath(os.path.dirname(options["file"]))) system_specific_zipping( files_dict = dict([(src_path, v["dest_path"]) for src_path, v in files_dict.iteritems()]), zip_file = options["file"], compression=ZIP_DEFLATED if options['compress'] else ZIP_STORED, callback=_default_callback_zip if options["verbosity"] else None, )
def download_video(youtube_id, download_path="../content/", download_url=OUTSIDE_DOWNLOAD_URL, format="mp4", callback=None):
    """Downloads the video file to disk (note: this does NOT invalidate any of the cached html files in KA Lite)

    Downloads both the video and its thumbnail. On cancellation or failure,
    any partially-downloaded files are deleted; non-cancellation errors are
    re-raised.
    """
    ensure_dir(download_path)

    video_filename = "%(id)s.%(format)s" % {"id": youtube_id, "format": format}
    # Fixed: paths were built by string concatenation, which breaks when
    # download_path lacks a trailing separator.
    filepath = os.path.join(download_path, video_filename)
    url = download_url % (video_filename, video_filename)

    thumb_filename = "%(id)s.png" % {"id": youtube_id}
    thumb_filepath = os.path.join(download_path, thumb_filename)
    thumb_url = download_url % (video_filename, thumb_filename)

    try:
        path, response = download_file(url, filepath, callback_percent_proxy(callback, end_percent=95))
        if not response.type.startswith("video"):
            raise URLNotFound("Video was not found!")

        path, response = download_file(thumb_url, thumb_filepath, callback_percent_proxy(callback, start_percent=95, end_percent=100))
        if not response.type.startswith("image"):
            raise URLNotFound("Thumbnail was not found!")
    except DownloadCancelled:
        # Deliberate cancellation: clean up, don't propagate.
        delete_downloaded_files(youtube_id, download_path)
    except Exception:
        delete_downloaded_files(youtube_id, download_path)
        raise
def update_metadata(package_metadata, version=VERSION):
    """
    We've zipped the packages, and now have unzipped & zipped sizes.
    Update this info in the local metadata (but not inside the zip)
    """
    master_filepath = get_language_pack_availability_filepath(version=version)
    master_metadata = softload_json(master_filepath, logger=logging.warn, errmsg="Error opening master language pack metadata")

    for lc, updated_meta in package_metadata.iteritems():
        lang_code_ietf = lcode_to_ietf(lc)

        # Merge the freshly computed values into whatever was stored before.
        metadata_filepath = get_language_pack_metadata_filepath(lang_code_ietf, version=version)
        stored_meta = softload_json(metadata_filepath, logger=logging.warn, errmsg="Error opening %s language pack metadata" % lc)
        stored_meta.update(updated_meta)

        # Persist the per-language record (read on download by distributed
        # servers), and mirror it into the master record (used by the
        # central server to answer API requests).
        with open(metadata_filepath, 'w') as fp:
            json.dump(stored_meta, fp)
        master_metadata[lang_code_ietf] = stored_meta

    # Save updated master
    ensure_dir(os.path.dirname(master_filepath))
    with open(master_filepath, 'w') as fp:
        json.dump(master_metadata, fp)
    logging.info("Local record of translations updated")
def handle(self, *args, **options):
    """Package the KA Lite codebase plus a generated local_settings.py into
    a distribution zip. Central-server only."""
    if not settings.CENTRAL_SERVER:
        raise CommandError("Disabled for distributed servers, until we can figure out what to do with ")

    options['platform'] = options['platform'].lower()  # normalize
    if options['platform'] not in ["all", "linux", "macos", "darwin", "windows"]:
        raise CommandError("Unrecognized platform: %s; will include ALL files." % options['platform'])

    # Step 0: refresh all resources
    get_dubbed_video_map(force=True)  # force a remote download

    # Step 1: recursively add all static files
    kalite_base = os.path.realpath(settings.PROJECT_PATH + "/../")
    files_dict = recursively_add_files(dirpath=kalite_base, **options)

    # Step 2: Add a local_settings.py file.
    #   For distributed servers, this is a copy of the local local_settings.py,
    #   with a few properties (specified as command-line options) overridden
    ls_file = create_local_settings_file(location=os.path.realpath(kalite_base+"/kalite/local_settings.py"), server_type=options['server_type'], locale=options['locale'], central_server=options["central_server"])
    files_dict[ls_file] = { "dest_path": "kalite/local_settings.py" }

    # Step 3: select output file.
    if not options['file']:
        options['file'] = create_default_archive_filename(options)

    # Step 4: package into a zip file
    ensure_dir(os.path.realpath(os.path.dirname(options["file"])))  # allows relative paths to be passed
    # NOTE(review): the dict passed below maps dest_path -> src_path; confirm
    # this orientation matches what system_specific_zipping expects.
    system_specific_zipping(
        files_dict = dict([(v["dest_path"], src_path) for src_path, v in files_dict.iteritems()]),
        zip_file = options["file"],
        compression=ZIP_DEFLATED if options['compress'] else ZIP_STORED,
        callback=_default_callback_zip if options["verbosity"] else None,
    )
def generate_zipped_srts(lang_codes_to_update, download_path=DOWNLOAD_PATH):
    """Build one <lang>_subtitles.zip archive per language code under
    MEDIA_ROOT/subtitles/; falsy lang_codes_to_update means every
    directory under download_path."""
    # Create media directory if it doesn't yet exist
    ensure_dir(settings.MEDIA_ROOT)
    zip_path = settings.MEDIA_ROOT + "subtitles/"
    ensure_dir(zip_path)

    codes = lang_codes_to_update or os.listdir(download_path)
    for code in codes:
        srt_dir = os.path.join(download_path, code, "subtitles")
        zip_file = os.path.join(zip_path, "%s_subtitles.zip" % code)

        # Remove any old version (as we may not re-create)
        if os.path.exists(zip_file):
            os.remove(zip_file)

        if not os.path.exists(srt_dir):
            logging.warn("No srt directory for %s; skipping." % code)
            continue

        srt_files = glob.glob(os.path.join(srt_dir, "*.srt"))
        if not srt_files:
            logging.warn("No srts for %s; skipping." % code)
            continue

        logging.info("Zipping up a new pack for language code: %s" % code)
        archive = zipfile.ZipFile(zip_file, 'w')
        for srt in srt_files:
            archive.write(srt, arcname=os.path.basename(srt))
        archive.close()
def move_files(self): """If necessary (determined previously), move video files on disk. Otherwise, write into local_settings.""" # Move over videos if self.move_videos == "y": if os.path.exists(settings.CONTENT_ROOT): video_files = set(glob.glob(settings.CONTENT_ROOT + '*')) - set((settings.CONTENT_ROOT + "note.txt",)) else: video_files = set() sys.stdout.write("* Moving over %d files (videos and thumbnails)\n" % len(video_files)) if not os.path.exists(self.working_dir + "/content/"): os.mkdir(self.working_dir + "/content/") for video_file in video_files: shutil.move(video_file, self.working_dir + "/content/" + os.path.split(video_file)[1]) else: # write (append) fh = open(self.working_dir + "/kalite/local_settings.py", "a") fh.write("\nCONTENT_ROOT = '%s'\n" % settings.CONTENT_ROOT) fh.close() # Move inner zip file if not os.path.exists(self.inner_zip_file) or not os.path.exists(self.signature_file): sys.stderr.write("\tCould not find inner zip file / signature file for storage. Continuing...\n") else: try: zip_dir = os.path.join(self.working_dir, "kalite", "static", "zip") ensure_dir(zip_dir) shutil.move(self.inner_zip_file, os.path.join(zip_dir, os.path.basename(self.inner_zip_file))) shutil.move(self.signature_file, os.path.join(zip_dir, os.path.basename(self.signature_file))) except Exception as e: sys.stderr.write("\tCould not keep inner zip file / signature for future re-packaging (%s). Continuing...\n" % e)
def handle(self, *args, **options):
    """Record the on-disk sizes of all local videos, merged into the
    remote video-size JSON file (keeping the max of stored vs. current)."""
    if settings.CENTRAL_SERVER:
        raise CommandError("Run this command on the distributed server only.")

    # Load previously known sizes (best-effort).
    video_sizes = softload_json(REMOTE_VIDEO_SIZE_FILEPATH, logger=logging.debug)

    local_videos = glob.glob(os.path.join(settings.CONTENT_ROOT, "*.mp4"))
    logging.info("Querying sizes for %d video(s)." % len(local_videos))

    for filepath in local_videos:
        youtube_id = os.path.splitext(os.path.basename(filepath))[0]
        # Set to max, so that local compressed videos will not affect things.
        video_sizes[youtube_id] = max(video_sizes.get(youtube_id, 0), os.path.getsize(filepath))

    # Sort keys for a stable, diff-friendly output file.
    video_sizes = OrderedDict((key, video_sizes[key]) for key in sorted(video_sizes.keys()))

    logging.info("Saving results to disk.")
    ensure_dir(os.path.dirname(REMOTE_VIDEO_SIZE_FILEPATH))
    with open(REMOTE_VIDEO_SIZE_FILEPATH, "w") as fp:
        json.dump(video_sizes, fp, indent=2)
def update_templates():
    """Update template po files"""
    logging.info("Posting template po files to static/pot/")
    # post them to exposed URL
    pot_dir = os.path.join(settings.STATIC_ROOT, "pot/")
    ensure_dir(pot_dir)
    en_messages = os.path.join(settings.LOCALE_PATHS[0], "en/LC_MESSAGES")
    shutil.copy(os.path.join(en_messages, "django.po"), os.path.join(pot_dir, "kalite.pot"))
    shutil.copy(os.path.join(en_messages, "djangojs.po"), os.path.join(pot_dir, "kalitejs.pot"))
def unpack_language(code, zip_file):
    """Unpack zipped language pack into locale directory"""
    logging.info("Unpacking new translations")
    target_dir = os.path.join(LOCALE_ROOT, code)
    ensure_dir(os.path.join(target_dir, "LC_MESSAGES"))

    ## zip_file holds the raw zip bytes; wrap for random access, then extract.
    archive = zipfile.ZipFile(StringIO(zip_file))
    archive.extractall(target_dir)
def update_templates():
    """Update template po files"""
    pot_path = os.path.join(settings.DATA_PATH_SECURE, "i18n", "pot")
    logging.info("Copying english po files to %s" % pot_path)

    # post them to exposed URL
    ensure_dir(pot_path)
    en_messages = os.path.join(settings.LOCALE_PATHS[0], "en/LC_MESSAGES")
    shutil.copy(os.path.join(en_messages, "django.po"), os.path.join(pot_path, "kalite.pot"))
    shutil.copy(os.path.join(en_messages, "djangojs.po"), os.path.join(pot_path, "kalitejs.pot"))
def unpack_language(lang_code, zip_filepath=None, zip_fp=None, zip_data=None):
    """Unpack zipped language pack into locale directory.

    Exactly one of zip_filepath (path on disk), zip_fp (open file object),
    or zip_data (raw zip bytes) should be provided.
    """
    lang_code = lcode_to_django_dir(lang_code)

    logging.info("Unpacking new translations")
    ensure_dir(os.path.join(LOCALE_ROOT, lang_code, "LC_MESSAGES"))

    ## Unpack into the locale dir
    # Fixed: the ZipFile (and any file handle opened here) was leaked.
    opened_here = None
    if zip_fp:
        fp = zip_fp
    elif zip_data:
        fp = StringIO(zip_data)
    else:
        fp = opened_here = open(zip_filepath, "rb")
    try:
        z = zipfile.ZipFile(fp)
        try:
            z.extractall(os.path.join(LOCALE_ROOT, lang_code))
        finally:
            z.close()
    finally:
        # Only close handles we opened ourselves; a caller-supplied zip_fp
        # remains the caller's responsibility.
        if opened_here:
            opened_here.close()
def update_templates():
    """Update template po files"""
    logging.info("Posting template po files to static/pot/")

    # post them to exposed URL
    pot_dir = os.path.join(settings.STATIC_ROOT, "pot/")
    ensure_dir(pot_dir)
    for po_name, pot_name in (("django.po", "kalite.pot"), ("djangojs.po", "kalitejs.pot")):
        shutil.copy(
            os.path.join(settings.LOCALE_PATHS[0], "en/LC_MESSAGES/" + po_name),
            os.path.join(pot_dir, pot_name))
def unpack_language(lang_code, zip_filepath=None, zip_fp=None, zip_data=None):
    """Unpack zipped language pack into locale directory.

    Exactly one of zip_filepath (path on disk), zip_fp (open file object),
    or zip_data (raw zip bytes) should be provided.
    """
    lang_code = lcode_to_django_dir(lang_code)

    logging.info("Unpacking new translations")
    ensure_dir(os.path.join(LOCALE_ROOT, lang_code, "LC_MESSAGES"))

    ## Unpack into the locale dir
    # Fixed: the ZipFile (and any file handle opened here) was leaked.
    opened_here = None
    if zip_fp:
        fp = zip_fp
    elif zip_data:
        fp = StringIO(zip_data)
    else:
        fp = opened_here = open(zip_filepath, "rb")
    try:
        z = zipfile.ZipFile(fp)
        try:
            z.extractall(os.path.join(LOCALE_ROOT, lang_code))
        finally:
            z.close()
    finally:
        # Only close handles we opened ourselves; a caller-supplied zip_fp
        # remains the caller's responsibility.
        if opened_here:
            opened_here.close()
def update_templates():
    """Update template po files"""
    pot_path = os.path.join(settings.DATA_PATH, "i18n", "pot")
    logging.info("Copying english po files to %s" % pot_path)

    # post them to exposed URL
    ensure_dir(pot_path)
    for po_name, pot_name in (("django.po", "kalite.pot"), ("djangojs.po", "kalitejs.pot")):
        shutil.copy(
            os.path.join(settings.LOCALE_PATHS[0], "en/LC_MESSAGES/" + po_name),
            os.path.join(pot_path, pot_name))
def _file_handler(filename=None, loggername=None):
    """Return a file handler with a filename specific to the logging type"""
    filename = filename or ((loggername + ".log") if loggername else "stats.log")

    # Make sure that the log directory exists before the handler needs it.
    ensure_dir(STATS_LOG_DIRPATH)

    # delay=True: don't actually open the file until the first record.
    handler = logging.FileHandler(os.path.join(STATS_LOG_DIRPATH, filename), encoding='utf-8', delay=True)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
    return handler
def extract_new_po(extract_path, combine_with_po_file=None, lang="all"):
    """Move newly downloaded po files to correct location in locale
    direction. Returns the location of the po file if a single language
    is given, or a list of locations if language is 'all'.
    """
    if combine_with_po_file:
        assert lang != 'all', "You can only combine a po file with only one other po file. Please select a specific language, not 'all'."
        # Fixed: the %s placeholder was never filled in.
        assert os.path.basename(combine_with_po_file) in ["django.po", "djangojs.po"], "File %s does not seem to be either django.po or djangojs.po." % combine_with_po_file

    if lang == 'all':
        languages = os.listdir(extract_path)
        return [extract_new_po(os.path.join(extract_path, l), lang=l) for l in languages]

    converted_code = lcode_to_django_dir(lang)

    # ensure directory exists in locale folder, and then overwrite local po files with new ones
    dest_path = os.path.join(LOCALE_ROOT, converted_code, "LC_MESSAGES")
    ensure_dir(dest_path)
    dest_file = os.path.join(dest_path, 'django.po')
    build_file = os.path.join(dest_path, 'djangobuild.po')  # so we dont clobber previous django.po that we build

    src_po_files = all_po_files(extract_path)
    # filter out po files that are giving me problems
    src_po_files = [po_file for po_file in src_po_files
                    if 'learn.math.trigonometry.exercises' not in po_file
                    and 'learn.math.algebra.exercises' not in po_file]

    concat_command = ['msgcat', '-o', build_file, '--no-location'] + src_po_files
    if combine_with_po_file and os.path.exists(combine_with_po_file):
        concat_command += [combine_with_po_file]

    # Fixed: the old code redirected sys.stdout/sys.stderr to StringIO, which
    # does NOT capture a child process's output (it writes to the real file
    # descriptors); capture via pipes instead.
    p = subprocess.Popen(concat_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = p.communicate()

    shutil.move(build_file, dest_file)
    return dest_file
def move_files(self): """If necessary (determined previously), move video files on disk. Otherwise, write into local_settings.""" # Move over videos if self.move_videos == "y": if os.path.exists(settings.CONTENT_ROOT): video_files = set( glob.glob(settings.CONTENT_ROOT + '*')) - set( (settings.CONTENT_ROOT + "note.txt", )) else: video_files = set() sys.stdout.write( "* Moving over %d files (videos and thumbnails)\n" % len(video_files)) if not os.path.exists(self.working_dir + "/content/"): os.mkdir(self.working_dir + "/content/") for video_file in video_files: shutil.move( video_file, self.working_dir + "/content/" + os.path.split(video_file)[1]) else: # write (append) fh = open(self.working_dir + "/kalite/local_settings.py", "a") fh.write("\nCONTENT_ROOT = '%s'\n" % settings.CONTENT_ROOT) fh.close() # Move inner zip file if not os.path.exists(self.inner_zip_file) or not os.path.exists( self.signature_file): sys.stderr.write( "\tCould not find inner zip file / signature file for storage. Continuing...\n" ) else: try: zip_dir = os.path.join(self.working_dir, "kalite", "static", "zip") ensure_dir(zip_dir) shutil.move( self.inner_zip_file, os.path.join(zip_dir, os.path.basename(self.inner_zip_file))) shutil.move( self.signature_file, os.path.join(zip_dir, os.path.basename(self.signature_file))) except Exception as e: sys.stderr.write( "\tCould not keep inner zip file / signature for future re-packaging (%s). Continuing...\n" % e)
def move_srts(code):
    """
    Srts live in the locale directory, but that's not exposed at any URL. So instead,
    we have to move the srts out to /static/subtitles/[code]/
    """
    subtitles_static_dir = os.path.join(settings.STATIC_ROOT, "subtitles")
    srt_static_dir = os.path.join(subtitles_static_dir, code)
    srt_locale_dir = os.path.join(LOCALE_ROOT, code, "subtitles")

    ensure_dir(srt_static_dir)
    for srt_path in glob.glob(os.path.join(srt_locale_dir, "*.srt")):
        target = os.path.join(srt_static_dir, os.path.basename(srt_path))
        # Overwrite any previously exposed copy.
        if os.path.exists(target):
            os.remove(target)
        shutil.move(srt_path, target)
def move_dubbed_video_map(lang_code):
    """Relocate the downloaded dubbed-video mapping into its canonical
    location, then remove the download directory."""
    pack_root = os.path.join(LOCALE_ROOT, lang_code)
    download_dir = os.path.join(pack_root, "dubbed_videos")
    downloaded_map = os.path.join(download_dir, os.path.basename(DUBBED_VIDEOS_MAPPING_FILEPATH))

    if not os.path.exists(downloaded_map):
        logging.error("Could not find downloaded dubbed video filepath: %s" % downloaded_map)
        return

    logging.debug("Moving dubbed video map to %s" % DUBBED_VIDEOS_MAPPING_FILEPATH)
    ensure_dir(os.path.dirname(DUBBED_VIDEOS_MAPPING_FILEPATH))
    shutil.move(downloaded_map, DUBBED_VIDEOS_MAPPING_FILEPATH)

    logging.debug("Removing emtpy directory")
    try:
        shutil.rmtree(download_dir)
    except Exception as e:
        logging.error("Error removing dubbed video directory (%s): %s" % (download_dir, e))
def handle(self, *args, **options):
    """Package the KA Lite codebase plus a generated local_settings.py into
    a distribution zip. Central-server only."""
    if not settings.CENTRAL_SERVER:
        raise CommandError(
            "Disabled for distributed servers, until we can figure out what to do with "
        )

    options['platform'] = options['platform'].lower()  # normalize
    if options['platform'] not in [
            "all", "linux", "macos", "darwin", "windows"
    ]:
        raise CommandError(
            "Unrecognized platform: %s; will include ALL files." %
            options['platform'])

    # Step 0: refresh all resources
    get_dubbed_video_map(force=True)  # force a remote download

    # Step 1: recursively add all static files
    kalite_base = os.path.realpath(settings.PROJECT_PATH + "/../")
    files_dict = recursively_add_files(dirpath=kalite_base, **options)

    # Step 2: Add a local_settings.py file.
    #   For distributed servers, this is a copy of the local local_settings.py,
    #   with a few properties (specified as command-line options) overridden
    ls_file = create_local_settings_file(
        location=os.path.realpath(kalite_base + "/kalite/local_settings.py"),
        server_type=options['server_type'],
        locale=options['locale'],
        central_server=options["central_server"])
    files_dict[ls_file] = {"dest_path": "kalite/local_settings.py"}

    # Step 3: select output file.
    if not options['file']:
        options['file'] = create_default_archive_filename(options)

    # Step 4: package into a zip file
    ensure_dir(os.path.realpath(os.path.dirname(
        options["file"])))  # allows relative paths to be passed
    # NOTE(review): the dict passed below maps dest_path -> src_path; confirm
    # this orientation matches what system_specific_zipping expects.
    system_specific_zipping(
        files_dict=dict([(v["dest_path"], src_path)
                         for src_path, v in files_dict.iteritems()]),
        zip_file=options["file"],
        compression=ZIP_DEFLATED if options['compress'] else ZIP_STORED,
        callback=_default_callback_zip if options["verbosity"] else None,
    )
def update_jsi18n_file(code="en"):
    """
    For efficieny's sake, we want to cache Django's js18n file.
    So, generate that file here, then save to disk--it won't change
    until the next language pack update!
    """
    output_dir = os.path.join(settings.STATIC_ROOT, "js", "i18n")
    ensure_dir(output_dir)
    output_file = os.path.join(output_dir, "%s.js" % code)

    # Build a minimal fake request so the catalog view can be called directly.
    fake_request = HttpRequest()
    fake_request.path = output_file
    fake_request.session = {'django_language': code}

    catalog_response = javascript_catalog(fake_request, packages=('ka-lite.locale',))
    with open(output_file, "w") as fp:
        fp.write(catalog_response.content)
def extract_new_po(tmp_dir_path=os.path.join(LOCALE_ROOT, "tmp"), language_codes=[]):
    """Move newly downloaded po files to correct location in locale direction.

    language_codes, when non-empty, limits the update to those languages.
    (Mutable default kept for interface compatibility; it is never mutated.)
    """
    logging.info("Unpacking new translations")
    update_languages = os.listdir(tmp_dir_path)
    if language_codes:  # limit based on passed in limitations
        # Fixed: set has no 'intersect' method — this raised AttributeError
        # whenever language_codes was supplied.
        update_languages = set(update_languages).intersection(set(language_codes))

    for lang in update_languages:
        converted_code = convert_language_code_format(lang)
        # ensure directory exists in locale folder, and then overwrite local po files with new ones
        ensure_dir(os.path.join(LOCALE_ROOT, converted_code, "LC_MESSAGES"))
        for po_file in glob.glob(os.path.join(tmp_dir_path, lang, "*/*.po")):
            if "js" in os.path.basename(po_file):
                shutil.copy(po_file, os.path.join(LOCALE_ROOT, converted_code, "LC_MESSAGES", "djangojs.po"))
            else:
                shutil.copy(po_file, os.path.join(LOCALE_ROOT, converted_code, "LC_MESSAGES", "django.po"))
def update_jsi18n_file(code="en"):
    """
    For efficieny's sake, we want to cache Django's js18n file.
    So, generate that file here, then save to disk--it won't change
    until the next language pack update!
    """
    out_dir = os.path.join(settings.STATIC_ROOT, "js", "i18n")
    ensure_dir(out_dir)
    out_path = os.path.join(out_dir, "%s.js" % code)

    # Fake up just enough of a request to render the catalog view.
    req = HttpRequest()
    req.path = out_path
    req.session = {"django_language": code}

    resp = javascript_catalog(req, packages=("ka-lite.locale",))
    with open(out_path, "w") as fp:
        fp.write(resp.content)
def update_jsi18n_file(code="en"):
    """
    For efficieny's sake, we want to cache Django's js18n file.
    So, generate that file here, then save to disk--it won't change
    until the next language pack update!
    """
    translation.activate(code)  # we switch the language of the whole thread

    out_dir = os.path.join(settings.STATIC_ROOT, "js", "i18n")
    ensure_dir(out_dir)
    out_path = os.path.join(out_dir, "%s.js" % code)

    # Fake up just enough of a request to render the catalog view.
    fake_request = HttpRequest()
    fake_request.path = out_path
    fake_request.session = {settings.LANGUAGE_COOKIE_NAME: code}

    catalog_response = javascript_catalog(fake_request, packages=('ka-lite.locale',))
    with open(out_path, "w") as fp:
        fp.write(catalog_response.content)
def move_dubbed_video_map(lang_code):
    """Move the downloaded dubbed-video mapping file into place, then
    clean up its download directory."""
    pack_location = os.path.join(LOCALE_ROOT, lang_code)
    download_dir = os.path.join(pack_location, "dubbed_videos")
    map_src = os.path.join(download_dir, os.path.basename(DUBBED_VIDEOS_MAPPING_FILEPATH))

    if not os.path.exists(map_src):
        logging.error("Could not find downloaded dubbed video filepath: %s" % map_src)
    else:
        logging.debug("Moving dubbed video map to %s" % DUBBED_VIDEOS_MAPPING_FILEPATH)
        ensure_dir(os.path.dirname(DUBBED_VIDEOS_MAPPING_FILEPATH))
        shutil.move(map_src, DUBBED_VIDEOS_MAPPING_FILEPATH)

        logging.debug("Removing emtpy directory")
        try:
            shutil.rmtree(download_dir)
        except Exception as e:
            logging.error("Error removing dubbed video directory (%s): %s" % (download_dir, e))
def produce_outputs(src_po_files, dest_path, lang_code):
    """Merge the given po files into dest_path/django.po (plus a compiled
    django.mo); kalitejs* files are compiled separately into
    djangojs.po/.mo. Returns the path of the combined django.po.

    NOTE(review): lang_code is unused in this body — confirm whether any
    caller relies on it before removing.
    """
    # ensure directory exists in locale folder, and then overwrite local po files with new ones
    ensure_dir(dest_path)
    dest_file = os.path.join(dest_path, 'django.po')
    dest_mo_file = os.path.join(dest_path, 'django.mo')
    build_file = os.path.join(
        dest_path, 'djangobuild.po'
    )  # so we dont clobber previous django.po that we build

    logging.info('Concatenating all po files found...')
    try:
        build_po = polib.pofile(build_file)
    except IOError as e:  # build_file doesn't exist yet
        build_po = polib.POFile(fpath=build_file)

    for src_file in src_po_files:
        if os.path.basename(src_file).startswith('kalitejs'):
            # JS catalogs are kept separate from the main django catalog.
            logging.debug('Compiling %s on its own...' % src_file)
            js_po_file = polib.pofile(src_file)
            js_mo_file = os.path.join(dest_path, 'djangojs.mo')
            js_po_file.save(os.path.join(dest_path, 'djangojs.po'))
            js_po_file.save_as_mofile(js_mo_file)
        else:
            logging.debug('Concatenating %s with %s...' % (src_file, build_file))
            src_po = polib.pofile(src_file)
            build_po.merge(src_po)

    # de-obsolete messages
    for poentry in build_po:
        # ok build_po appears to be a list, but not actually one. Hence just doing
        # a list comprehension over it won't work. So we unobsolete entries so that
        # they can be detected and turned into a mo file
        poentry.obsolete = False

    build_po.save()
    build_po.save_as_mofile(dest_mo_file)
    shutil.move(build_file, dest_file)
    return dest_file
def move_old_subtitles():
    """Migrate srt files from the legacy static/srt layout into the locale
    directory tree, then delete the legacy srt root."""
    locale_root = settings.LOCALE_PATHS[0]
    srt_root = os.path.join(settings.STATIC_ROOT, "srt")
    if not os.path.exists(srt_root):
        return  # nothing stored in the old layout; nothing to migrate

    logging.info("Outdated schema detected for storing srt files. Hang tight, the moving crew is on it.")
    for lang in os.listdir(srt_root):
        # Only per-language sub-directories are interesting.
        if not os.path.isdir(os.path.join(srt_root, lang)):
            continue
        old_srt_dir = os.path.join(srt_root, lang, "subtitles/")
        new_lang_dir = os.path.join(locale_root, lang)
        ensure_dir(new_lang_dir)
        destination = os.path.join(new_lang_dir, "subtitles")
        for src in glob.glob(os.path.join(old_srt_dir, "*.srt")):
            filename = os.path.basename(src)
            target = os.path.join(destination, filename)
            # Never overwrite a subtitle that already made the move.
            if os.path.exists(target):
                continue
            ensure_dir(destination)
            shutil.move(src, target)
    shutil.rmtree(srt_root)
    logging.info("Move completed.")
def scrape_exercise(exercise_id, lang_code, force=False):
    """Fetch one localized khan-exercises HTML file and store it locally.

    Skips the download when the file is already present, unless force=True.
    Download failures are logged rather than raised (best-effort scrape).
    """
    ietf_code = lcode_to_ietf(lang_code)
    dest_path = get_exercise_filepath(exercise_id, lang_code=lang_code)
    localized_root = os.path.dirname(dest_path)

    # Already on disk and no forced refresh requested: nothing to do.
    if not force and os.path.exists(dest_path):
        return

    url = "https://es.khanacademy.org/khan-exercises/exercises/%s.html?lang=%s" % (exercise_id, ietf_code)
    logging.info("Retrieving exercise %s from %s" % (exercise_id, url))
    try:
        ensure_dir(localized_root)
        resp = requests.get(url)
        resp.raise_for_status()
        with open(dest_path, "wb") as fp:
            fp.write(resp.content)
    except Exception as e:
        logging.error("Failed to download %s: %s" % (url, e))
def move_exercises(lang_code):
    """Move downloaded exercise HTML files from the language-pack download
    area into the localized exercise directory, merging with any resources
    already present, then remove the (now empty) download directory.
    """
    lang_pack_location = os.path.join(LOCALE_ROOT, lang_code)
    src_exercise_dir = os.path.join(lang_pack_location, "exercises")
    dest_exercise_dir = get_localized_exercise_dirpath(lang_code, is_central_server=False)

    if not os.path.exists(src_exercise_dir):
        logging.warn("Could not find downloaded exercises; skipping: %s" % src_exercise_dir)
    else:
        # Move over one at a time, to combine with any other resources that were there before.
        ensure_dir(dest_exercise_dir)
        all_exercise_files = glob.glob(os.path.join(src_exercise_dir, "*.html"))
        logging.info("Moving %d downloaded exercises to %s" % (len(all_exercise_files), dest_exercise_dir))

        for exercise_file in all_exercise_files:
            shutil.move(exercise_file, os.path.join(dest_exercise_dir, os.path.basename(exercise_file)))

        logging.debug("Removing empty directory")  # fixed typo: was "emtpy"
        try:
            shutil.rmtree(src_exercise_dir)
        except Exception as e:
            # Fixed copy-paste error: this is the exercise directory,
            # not the dubbed-video directory.
            logging.error("Error removing exercise directory (%s): %s" % (src_exercise_dir, e))
def produce_outputs(src_po_files, dest_path, lang_code):
    """Combine the given source .po files into django.po / django.mo under dest_path.

    Files named kalitejs* are compiled standalone into djangojs.po / djangojs.mo;
    all other files are concatenated into a single catalog.  Returns the path
    of the final django.po file.  (lang_code is accepted but unused here.)
    """
    # ensure directory exists in locale folder, and then overwrite local po files with new ones
    ensure_dir(dest_path)
    dest_file = os.path.join(dest_path, 'django.po')
    dest_mo_file = os.path.join(dest_path, 'django.mo')
    build_file = os.path.join(dest_path, 'djangobuild.po')  # so we dont clobber previous django.po that we build
    logging.info('Concatenating all po files found...')
    try:
        build_po = polib.pofile(build_file)
    except IOError as e:
        # build_file doesn't exist yet; start with an empty catalog.
        build_po = polib.POFile(fpath=build_file)
    for src_file in src_po_files:
        if os.path.basename(src_file).startswith('kalitejs'):
            # The javascript catalog is compiled on its own, never merged.
            logging.debug('Compiling %s on its own...' % src_file)
            js_po_file = polib.pofile(src_file)
            js_mo_file = os.path.join(dest_path, 'djangojs.mo')
            js_po_file.save(os.path.join(dest_path, 'djangojs.po'))
            js_po_file.save_as_mofile(js_mo_file)
        else:
            logging.debug('Concatenating %s with %s...' % (src_file, build_file))
            src_po = polib.pofile(src_file)
            build_po.merge(src_po)
    # de-obsolete messages
    for poentry in build_po:
        # ok build_po appears to be a list, but not actually one. Hence just doing
        # a list comprehension over it won't work. So we unobsolete entries so that
        # they can be detected and turned into a mo file
        poentry.obsolete = False
    build_po.save()
    build_po.save_as_mofile(dest_mo_file)
    shutil.move(build_file, dest_file)
    return dest_file
def download_video(youtube_id, download_path="../content/", download_url=OUTSIDE_DOWNLOAD_URL, format="mp4", callback=None):
    """Downloads the video file to disk (note: this does NOT invalidate any of the cached html files in KA Lite)

    Downloads both the video (first 95% of the progress range) and its
    thumbnail (last 5%).  On cancellation or any error, partially downloaded
    files are deleted and the exception is re-raised.

    Raises URLNotFound when the response content-type is not video/image.
    """
    ensure_dir(download_path)

    url, thumb_url = get_outside_video_urls(youtube_id, download_url=download_url, format=format)

    # os.path.join (rather than string concatenation) so download_path works
    # with or without a trailing path separator.
    video_filename = "%(id)s.%(format)s" % {"id": youtube_id, "format": format}
    filepath = os.path.join(download_path, video_filename)

    thumb_filename = "%(id)s.png" % {"id": youtube_id}
    thumb_filepath = os.path.join(download_path, thumb_filename)

    try:
        path, response = download_file(url, filepath, callback_percent_proxy(callback, end_percent=95))
        if not response.type.startswith("video"):
            raise URLNotFound("Video was not found!")

        path, response = download_file(thumb_url, thumb_filepath, callback_percent_proxy(callback, start_percent=95, end_percent=100))
        if not response.type.startswith("image"):
            raise URLNotFound("Thumbnail was not found!")
    except DownloadCancelled:
        # User aborted: clean up partial files, then propagate.
        delete_downloaded_files(youtube_id, download_path)
        raise
    except Exception:
        # Any other failure: clean up partial files, then propagate.
        delete_downloaded_files(youtube_id, download_path)
        raise
def scrape_exercise(exercise_id, lang_code, force=False):
    """Download a localized exercise HTML file into the static khan-exercises
    tree, skipping files that already exist unless force is set.  Errors are
    logged rather than raised."""
    ka_code = lang_code.lower()
    localized_dir = os.path.join(settings.STATIC_ROOT, "js", "khan-exercises", "exercises", ka_code)
    dest_filepath = os.path.join(localized_dir, "%s.%s" % (exercise_id, "html"))

    # Nothing to do when the file is already on disk and no refresh was asked for.
    if not force and os.path.exists(dest_filepath):
        return

    url = "https://es.khanacademy.org/khan-exercises/exercises/%s.html?lang=%s" % (exercise_id, ka_code)
    logging.info("Retrieving exercise %s from %s" % (exercise_id, url))
    try:
        ensure_dir(localized_dir)
        resp = requests.get(url)
        resp.raise_for_status()
        with open(dest_filepath, "wb") as fp:
            fp.write(resp.content)
    except Exception as e:
        logging.error("Failed to download %s: %s" % (url, e))
def handle(self, *args, **options): options['platform'] = options['platform'].lower() # normalize if options['platform'] not in [ "all", "linux", "macos", "darwin", "windows" ]: raise CommandError( "Unrecognized platform: %s; will include ALL files." % options['platform']) # Step 1: recursively add all static files kalite_base = os.path.realpath(settings.PROJECT_PATH + "/../") files_dict = recursively_add_files(dirpath=kalite_base, **options) # Step 2: Add a local_settings.py file. # For distributed servers, this is a copy of the local local_settings.py, # with a few properties (specified as command-line options) overridden ls_file = create_local_settings_file( location=os.path.realpath(kalite_base + "/kalite/local_settings.py"), server_type=options['server_type'], locale=options['locale']) files_dict[ls_file] = {"dest_path": "kalite/local_settings.py"} # Step 3: select output file. if options['file'] == "__default__": options['file'] = create_default_archive_filename(options) # Step 4: package into a zip file ensure_dir(os.path.realpath(os.path.dirname(options["file"]))) system_specific_zipping( files_dict=dict([(src_path, v["dest_path"]) for src_path, v in files_dict.iteritems()]), zip_file=options["file"], compression=ZIP_DEFLATED if options['compress'] else ZIP_STORED, callback=_default_callback_zip if options["verbosity"] else None, )
def handle(self, *args, **options):
    """Download dubbed videos for a language on the distributed server.

    Video ids come (in priority order) from --video_ids, from --topic_id,
    or from every entry in the language's dubbed-video map.  Each id is
    resolved to a youtube id through the map before scraping.
    """
    if settings.CENTRAL_SERVER:
        raise CommandError("This must only be run on the distributed server.")
    if not options["lang_code"]:
        raise CommandError("You must specify a language code.")

    # ensure_dir(settings.CONTENT_ROOT)

    # Get list of videos
    lang_code = lcode_to_ietf(options["lang_code"])
    video_map = get_dubbed_video_map(lang_code) or {}
    # Fall through the three possible sources of video ids.
    video_ids = options["video_ids"].split(",") if options["video_ids"] else None
    video_ids = video_ids or ([vid["id"] for vid in get_topic_videos(topic_id=options["topic_id"])] if options["topic_id"] else None)
    video_ids = video_ids or video_map.keys()

    # Download the videos
    for video_id in video_ids:
        if video_id in video_map:
            youtube_id = video_map[video_id]
        elif video_id in video_map.values():
            # Perhaps they sent in a youtube ID?  We can handle that!
            youtube_id = video_id
        else:
            logging.error("No mapping for video_id=%s; skipping" % video_id)
            continue
        try:
            scrape_video(youtube_id=youtube_id, format=options["format"], force=options["force"])
            #scrape_thumbnail(youtube_id=youtube_id)
            logging.info("Access video %s at %s" % (youtube_id, get_node_cache("Video")[video_id][0]["path"]))
        except Exception as e:
            # Per-video failures should not abort the whole run.
            logging.error("Failed to download video %s: %s" % (youtube_id, e))

    logging.info("Process complete.")
def generate_srt_availability_file(lang_code):
    '''
    For compatibility with versions less than 0.10.3, we need to generate
    this json file that contains the youtube ids of the srts available for
    the given language.  Returns the list of ids written.
    '''

    # this path is a direct copy of the path found in the old function that generated this file
    srts_file_dest_path = os.path.join(settings.STATIC_ROOT, 'data', 'subtitles', 'languages', "%s_available_srts.json") % lang_code
    ensure_dir(os.path.dirname(srts_file_dest_path))

    srts_path = get_srt_path(lang_code)  # not sure yet about this; change once command is complete
    try:
        files = os.listdir(srts_path)
    except OSError:  # directory doesnt exist or we cant read it
        files = []

    # Fixed: was f.rstrip(".srt"), which strips any run of trailing '.', 's',
    # 'r', 't' characters and so corrupts ids ending in those letters
    # (e.g. "abcts" -> "abc").  Strip the extension instead.
    yt_ids = [os.path.splitext(f)[0] for f in files]
    srts_dict = {'srt_files': yt_ids}

    with open(srts_file_dest_path, 'wb') as fp:
        logging.debug('Creating %s', srts_file_dest_path)
        json.dump(srts_dict, fp)

    return yt_ids
def handle(self, *args, **options):
    """Measure every local .mp4 and merge the sizes into the remote
    video-size json, keeping the largest size seen per video."""
    if settings.CENTRAL_SERVER:
        raise CommandError("Run this command on the distributed server only.")

    # Start from whatever sizes were previously recorded.
    video_sizes = softload_json(REMOTE_VIDEO_SIZE_FILEPATH, logger=logging.debug)

    local_videos = glob.glob(os.path.join(settings.CONTENT_ROOT, "*.mp4"))
    logging.info("Querying sizes for %d video(s)." % len(local_videos))

    for filepath in local_videos:
        youtube_id = os.path.splitext(os.path.basename(filepath))[0]
        # Take the max so that locally compressed videos cannot shrink the record.
        video_sizes[youtube_id] = max(video_sizes.get(youtube_id, 0), os.path.getsize(filepath))

    # Persist in sorted-by-key order.
    video_sizes = OrderedDict([(key, video_sizes[key]) for key in sorted(video_sizes.keys())])

    logging.info("Saving results to disk.")
    ensure_dir(os.path.dirname(REMOTE_VIDEO_SIZE_FILEPATH))
    with open(REMOTE_VIDEO_SIZE_FILEPATH, "w") as fp:
        json.dump(video_sizes, fp, indent=2)
def scrape_exercise(exercise_id, lang_code, force=False):
    """Fetch one localized khan-exercises HTML file and store it locally.

    Skips the download when the destination file already exists, unless
    force=True.  Download failures are logged rather than raised.
    """
    ietf_lang_code = lcode_to_ietf(lang_code)
    exercise_dest_filepath = get_exercise_filepath(exercise_id, lang_code=lang_code)
    exercise_localized_root = os.path.dirname(exercise_dest_filepath)

    # Already on disk and no forced refresh requested: nothing to do.
    if os.path.exists(exercise_dest_filepath) and not force:
        return

    exercise_url = "https://es.khanacademy.org/khan-exercises/exercises/%s.html?lang=%s" % (exercise_id, ietf_lang_code)
    logging.info("Retrieving exercise %s from %s" % (exercise_id, exercise_url))
    try:
        ensure_dir(exercise_localized_root)
        resp = requests.get(exercise_url)
        resp.raise_for_status()
        with open(exercise_dest_filepath, "wb") as fp:
            fp.write(resp.content)
    except Exception as e:
        # Best-effort scrape: report and move on.
        logging.error("Failed to download %s: %s" % (exercise_url, e))
def update_srt_availability(lang_code):
    """Update maps in srts_by_language with ids of downloaded subs.

    Writes <lang_code>_available_srts.json under SUBTITLES_DATA_ROOT/languages
    and returns the list of youtube ids found on disk.
    """
    srts_path = settings.STATIC_ROOT + "srt/"

    # Get a list of all srt files
    lang_srts_path = srts_path + lang_code + "/subtitles/"
    if not os.path.exists(lang_srts_path):
        # this happens when we tried to get srts, but none existed.
        yt_ids = []
    else:
        files = os.listdir(lang_srts_path)
        # Fixed: was f.rstrip(".srt"), which strips any run of trailing '.',
        # 's', 'r', 't' characters and corrupts ids ending in those letters.
        yt_ids = [os.path.splitext(f)[0] for f in files]
    srts_dict = {"srt_files": yt_ids}

    # Dump that to the language path
    base_path = settings.SUBTITLES_DATA_ROOT + "languages/"
    ensure_dir(base_path)
    filename = "%s_available_srts.json" % lang_code
    filepath = base_path + filename
    with open(filepath, 'wb') as fp:  # overwrite file
        json.dump(srts_dict, fp)

    return yt_ids
def obliterate_old_schema():
    """Move srt files from static/srt to locale directory and file them by language code, delete any old locale directories"""
    srt_root = os.path.join(settings.STATIC_ROOT, "srt")

    # Pass 1: purge locale dirs whose names don't match our code format.
    for locale_root in settings.LOCALE_PATHS:
        if not os.path.exists(locale_root):
            continue
        for lang in os.listdir(locale_root):
            # Skips if not a directory
            if not os.path.isdir(os.path.join(locale_root, lang)):
                continue
            # If it isn't crowdin/django format, keeeeeeellllllll
            if lang != convert_language_code_format(lang):
                logging.info("Deleting %s directory because it does not fit our language code format standards" % lang)
                shutil.rmtree(os.path.join(locale_root, lang))

    # Pass 2: migrate srt files out of the old static/srt layout.
    if os.path.exists(os.path.join(settings.STATIC_ROOT, "srt")):
        logging.info("Outdated schema detected for storing srt files. Hang tight, the moving crew is on it.")
        for lang in os.listdir(srt_root):
            # Skips if not a directory
            if not os.path.isdir(os.path.join(srt_root, lang)):
                continue
            lang_srt_path = os.path.join(srt_root, lang, "subtitles/")
            # NOTE(review): locale_root here is the leftover loop variable from
            # Pass 1 (the LAST entry of settings.LOCALE_PATHS) — confirm that
            # is the intended destination and not LOCALE_PATHS[0].
            lang_locale_path = os.path.join(locale_root, lang)
            ensure_dir(lang_locale_path)
            dst = os.path.join(lang_locale_path, "subtitles")
            for srt_file_path in glob.glob(os.path.join(lang_srt_path, "*.srt")):
                base_path, srt_filename = os.path.split(srt_file_path)
                # Never overwrite a subtitle that already migrated.
                if not os.path.exists(os.path.join(dst, srt_filename)):
                    ensure_dir(dst)
                    shutil.move(srt_file_path, os.path.join(dst, srt_filename))
        shutil.rmtree(srt_root)
        logging.info("Move completed.")
def update_metadata(package_metadata, version=VERSION):
    """
    We've zipped the packages, and now have unzipped & zipped sizes.
    Update this info in the local metadata (but not inside the zip):
    one json file per language pack, plus the master availability map.
    """
    master_filepath = get_language_pack_availability_filepath(version=version)
    master_metadata = softload_json(master_filepath, logger=logging.warn, errmsg="Error opening master language pack metadata")

    for code, new_meta in package_metadata.iteritems():
        ietf_code = lcode_to_ietf(code)

        # Pull in whatever metadata we already stored for this pack, then
        # overlay the freshly computed values.
        metadata_filepath = get_language_pack_metadata_filepath(ietf_code, version=version)
        meta = softload_json(metadata_filepath, logger=logging.warn, errmsg="Error opening %s language pack metadata" % code)
        meta.update(new_meta)

        # Per-language file: read on download by the distributed server to update its database.
        with open(metadata_filepath, 'w') as fp:
            json.dump(meta, fp)

        # Master map: used by the central server to answer API requests.
        master_metadata[ietf_code] = meta

    # Persist the updated master map.
    ensure_dir(os.path.dirname(master_filepath))
    with open(master_filepath, 'w') as fp:
        json.dump(master_metadata, fp)
    logging.info("Local record of translations updated")
def update_srt_availability(lang_code):
    """Update maps in srts_by_language with ids of downloaded subs.

    Writes <lang_code>_available_srts.json under SUBTITLES_DATA_ROOT/languages
    and returns the list of youtube ids found on disk.
    """
    srts_path = settings.STATIC_ROOT + "srt/"

    # Get a list of all srt files
    lang_srts_path = srts_path + lang_code + "/subtitles/"
    if not os.path.exists(lang_srts_path):
        # this happens when we tried to get srts, but none existed.
        yt_ids = []
    else:
        files = os.listdir(lang_srts_path)
        # Fixed: was f.rstrip(".srt"), which strips any run of trailing '.',
        # 's', 'r', 't' characters and corrupts ids ending in those letters.
        yt_ids = [os.path.splitext(f)[0] for f in files]
    srts_dict = {"srt_files": yt_ids}

    # Dump that to the language path
    base_path = settings.SUBTITLES_DATA_ROOT + "languages/"
    ensure_dir(base_path)
    filename = "%s_available_srts.json" % lang_code
    filepath = base_path + filename
    with open(filepath, 'wb') as fp:  # overwrite file
        json.dump(srts_dict, fp)

    return yt_ids
def create_all_mappings(force=False, frequency_to_save=100, response_to_check=None, date_to_check=None, map_file=SRTS_JSON_FILEPATH):
    """
    Write or update JSON file that maps from YouTube ID to Amara code and languages available.

    This command updates the json file that records what languages videos have been subtitled in.
    It loops through all video ids, records a list of which languages Amara says it has been subtitled in
    and meta data about the request (e.g. date, response code).

    See the schema in the docstring for fcn update_video_entry.

    Returns True iff at least one entry was refreshed.
    """
    youtube_ids = get_slug2id_map().values()

    # Initialize the data
    if not os.path.exists(map_file):
        ensure_dir(os.path.dirname(map_file))
        srts_dict = {}
    else:
        # Open the file, read, and clean out old videos.
        try:
            with open(map_file, "r") as fp:
                srts_dict = json.load(fp)
        except Exception as e:
            if not force:
                # only handle the error if force=True.
                # Otherwise, these data are too valuable to lose, so just assume a temp problem.
                raise
            else:
                logging.error("JSON file corrupted, using empty json and starting from scratch (%s)" % e)
                srts_dict = {}
        else:
            logging.info("Loaded %d mappings." % (len(srts_dict)))

        # Set of videos no longer used by KA Lite
        removed_videos = set(srts_dict.keys()) - set(youtube_ids)
        if removed_videos:
            logging.info("Removing subtitle information for %d videos (no longer used)." % len(removed_videos))
            for vid in removed_videos:
                del srts_dict[vid]

    logging.info("Querying %d mappings." % (len(youtube_ids) - (0 if (force or date_to_check) else len(srts_dict))))

    # Once we have the current mapping, proceed through logic to update the mapping
    n_refreshed = 0    # keep track to avoid writing if nothing's been refreshed.
    n_new_entries = 0  # keep track for reporting
    n_failures = 0     # keep track for reporting
    for youtube_id in youtube_ids:
        # Decide whether or not to update this video based on the arguments provided at the command line
        cached = youtube_id in srts_dict
        if not force and cached:

            # First, check against date
            flag_for_refresh = True  # not (response_code or last_attempt)
            last_attempt = srts_dict[youtube_id].get("last_attempt")
            last_attempt = None if not last_attempt else datetime.datetime.strptime(last_attempt, '%Y-%m-%d')
            flag_for_refresh = flag_for_refresh and (not date_to_check or date_to_check > last_attempt)
            if not flag_for_refresh:
                logging.debug("Skipping %s for date-check" % youtube_id)
                continue

            # Second, check against response code
            response_code = srts_dict[youtube_id].get("api_response")
            flag_for_refresh = flag_for_refresh and (not response_to_check or response_to_check == "all" or response_to_check == response_code)
            if not (flag_for_refresh):
                logging.debug("Skipping %s for response-code" % youtube_id)
                continue

            # No filter flags at all and already cached: nothing to refresh.
            if not response_to_check and not date_to_check and cached:
                # no flags specified and already cached - skip
                logging.debug("Skipping %s for already-cached and no flags specified" % youtube_id)
                continue

        # We're gonna check; just report the reason why.
        if force and not cached:
            logging.debug("Updating %s because force flag (-f) given and video not cached." % youtube_id)
        elif force and cached:
            logging.debug("Updating %s because force flag (-f) given. Video was previously cached." % youtube_id)
        else:
            logging.debug("Updating %s because video not yet cached." % youtube_id)

        # If it makes it to here without hitting a continue, then update the entry
        try:
            srts_dict[youtube_id] = update_video_entry(youtube_id, entry=srts_dict.get(youtube_id, {}))
            n_refreshed += 1
        except Exception as e:
            logging.warn("Error updating video %s: %s" % (youtube_id, e))
            n_failures += 1
            continue

        # Periodic checkpoint of the map file.
        # NOTE(review): this fires on the very first successful update
        # (n_new_entries == 0), and n_new_entries counts every successful
        # update, not just new entries — confirm the intended semantics.
        if n_new_entries % frequency_to_save == 0:
            logging.info("On loop %d dumping dictionary into %s" % (n_new_entries, map_file))
            with open(map_file, 'wb') as fp:
                json.dump(srts_dict, fp)
        n_new_entries += 1

    # Finished the loop: save and report
    if n_refreshed > 0:
        with open(map_file, 'wb') as fp:
            json.dump(srts_dict, fp)

    if n_failures == 0:
        logging.info("Great success! Added %d entries, updated %d entries, of %d total." % (n_new_entries, n_refreshed, len(srts_dict)))
    else:
        logging.warn("Stored %d new entries, refreshed %d entries, but with %s failures, of %d total." % (n_new_entries, n_refreshed, n_failures, len(srts_dict)))

    return n_refreshed != 0
def update_language_srt_map():
    """Update the per-language srt download-status maps from the api_info_map.

    Reads the remote-availability info map, reorganizes it per language code,
    merges each language's entries with its existing on-disk file (adding new
    ids, dropping unsupported ones), and removes map files for languages that
    are no longer supported.  Returns the full map keyed by language code.
    """
    # Load the current download status
    map_filepath = settings.SUBTITLES_DATA_ROOT + SRTS_JSON_FILENAME
    try:
        api_info_map = json.loads(open(map_filepath).read())
    except Exception as e:
        # Must be corrupted; start from scratch!
        # (Fixed: this handler previously referenced an undefined name,
        # turning any read failure into a NameError.)
        logging.warn("Could not open %s for updates; starting from scratch. Error=%s" % (map_filepath, e))
        api_info_map = {}

    # Next we want to iterate through those and create a big srt dictionary organized by language code
    remote_availability_map = {}
    for youtube_id, data in api_info_map.items():
        languages = data.get("language_codes", [])
        for lang_code in languages:
            if not lang_code in remote_availability_map:
                remote_availability_map[lang_code] = {}
            # This entry will be valid if it's new, otherwise it will be overwitten later
            remote_availability_map[lang_code][youtube_id] = {
                "downloaded": False,
                "api_response": "",
                "last_attempt": "",
                "last_success": "",
            }

    # Finally we need to iterate through that dictionary and create individual files for each language code
    lang_map_filepath = None  # fixed: initialize so the cleanup below cannot NameError
    for lang_code, new_data in remote_availability_map.items():
        # Try to open previous language file
        lang_map_filepath = get_lang_map_filepath(lang_code)
        if not os.path.exists(lang_map_filepath):
            lang_map = {}
        else:
            try:
                lang_map = json.loads(open(lang_map_filepath).read())
            except:
                logging.debug("Language download status mapping for (%s) is corrupted, rewriting it." % lang_code)
                lang_map = {}

        # First, check to see if it's empty (e.g. no subtitles available for any videos)
        if not new_data:
            logging.info("Subtitle support for %s has been terminated; removing." % lang_code)
            if os.path.exists(lang_map_filepath):
                os.remove(lang_map_filepath)
            continue

        # Compare which youtube ids are new / no longer supported, and apply.
        yt_ids_to_add = set(new_data.keys()) - set(lang_map.keys())
        yt_ids_to_delete = set(lang_map.keys()) - set(new_data.keys())
        if yt_ids_to_add:
            logging.info("Adding %d new YouTube IDs to language (%s)" % (len(yt_ids_to_add), lang_code))
            for yt_id in yt_ids_to_add:
                lang_map[yt_id] = new_data.get(yt_id)
        if yt_ids_to_delete:
            logging.info("Deleting %d old YouTube IDs from language (%s) because they are no longer supported." % (len(yt_ids_to_delete), lang_code))
            for yt_id in yt_ids_to_delete:
                lang_map.pop(yt_id, None)

        # Write the new file to the correct location
        logging.info("Writing %s" % lang_map_filepath)
        ensure_dir(os.path.dirname(lang_map_filepath))
        with open(lang_map_filepath, 'w') as outfile:
            json.dump(lang_map, outfile)

        # Update the big mapping with the most accurate numbers
        remote_availability_map[lang_code].update(lang_map)

    # Finally, remove any files not found in the current map at all.
    # (Fixed: guard against an empty availability map, where lang_map_filepath
    # would otherwise be unset, and against removing a non-existent file.)
    if lang_map_filepath:
        for filename in os.listdir(os.path.dirname(lang_map_filepath)):
            lang_code = filename.split("_")[0]
            if not lang_code in remote_availability_map:
                file_to_remove = get_lang_map_filepath(lang_code)
                logging.info("Subtitle support for %s has been terminated; removing." % lang_code)
                if os.path.exists(file_to_remove):
                    os.remove(file_to_remove)

    return remote_availability_map
def update_language_srt_map(map_file=SRTS_JSON_FILEPATH):
    """
    Translate the srts_remote_availability dictionary into language specific
    files that can be used by the cache_subtitles command.

    Note: srt map deals with amara, so uses ietf codes (e.g. en-us)

    Returns the full availability map, keyed by ietf language code.
    """
    # Load the current download status
    try:
        with open(map_file) as fp:
            api_info_map = json.load(fp)
    except Exception as e:
        # Must be corrupted; start from scratch!
        logging.warn("Could not open %s for updates; starting from scratch. Error=%s" % (map_file, e))
        api_info_map = {}

    # Next we want to iterate through those and create a big srt dictionary organized by language code
    remote_availability_map = {}
    for youtube_id, data in api_info_map.items():
        languages = data.get("language_codes", [])
        for lang_code in languages:
            lang_code = lcode_to_ietf(lang_code)
            if not lang_code in remote_availability_map:
                remote_availability_map[lang_code] = {}
            # This entry will be valid if it's new, otherwise it will be overwitten later
            remote_availability_map[lang_code][youtube_id] = {
                "downloaded": False,
                "api_response": "",
                "last_attempt": "",
                "last_success": "",
            }

    # Finally we need to iterate through that dictionary and create individual files for each language code
    lang_map_filepath = None  # fixed: initialize so the cleanup below cannot NameError on an empty map
    for lang_code, new_data in remote_availability_map.items():
        # Try to open previous language file
        lang_map_filepath = get_lang_map_filepath(lang_code)
        if not os.path.exists(lang_map_filepath):
            lang_map = {}
        else:
            try:
                with open(lang_map_filepath, "r") as fp:
                    lang_map = json.load(fp)
            except Exception as e:
                logging.error("Language download status mapping for (%s) is corrupted (%s), rewriting it." % (lang_code, e))
                lang_map = {}

        # First, check to see if it's empty (e.g. no subtitles available for any videos)
        if not new_data:
            logging.info("Subtitle support for %s has been terminated; removing." % lang_code)
            if os.path.exists(lang_map_filepath):
                os.remove(lang_map_filepath)
            continue

        # Compare which youtube ids are new / no longer supported, and apply.
        # (Removed two unused locals that duplicated these set operations.)
        yt_ids_to_add = set(new_data.keys()) - set(lang_map.keys())
        yt_ids_to_delete = set(lang_map.keys()) - set(new_data.keys())
        if yt_ids_to_add:
            logging.info("Adding %d new YouTube IDs to language (%s)" % (len(yt_ids_to_add), lang_code))
            for yt_id in yt_ids_to_add:
                lang_map[yt_id] = new_data.get(yt_id)
        if yt_ids_to_delete:
            logging.info("Deleting %d old YouTube IDs from language (%s) because they are no longer supported." % (len(yt_ids_to_delete), lang_code))
            for yt_id in yt_ids_to_delete:
                lang_map.pop(yt_id, None)

        # Write the new file to the correct location
        logging.debug("Writing %s" % lang_map_filepath)
        ensure_dir(os.path.dirname(lang_map_filepath))
        with open(lang_map_filepath, 'w') as outfile:
            json.dump(lang_map, outfile)

        # Update the big mapping with the most accurate numbers
        remote_availability_map[lang_code].update(lang_map)

    # Finally, remove any files not found in the current map at all.
    if lang_map_filepath:
        for filename in os.listdir(os.path.dirname(lang_map_filepath)):
            lang_code = filename.split("_")[0]  # fixed: was a redundant double assignment
            if not lang_code in remote_availability_map:
                file_to_remove = get_lang_map_filepath(lang_code)
                logging.info("Subtitle support for %s has been terminated; removing." % lang_code)
                if os.path.exists(file_to_remove):
                    os.remove(file_to_remove)
                else:
                    logging.warn("Subtitles metadata for %s not found; skipping deletion of non-existent file %s." % (lang_code, file_to_remove))

    return remote_availability_map
def test_file(self):
    # ensure_dir must refuse to treat an existing regular file as a directory.
    self.assertRaisesRegexp(OSError, 'Not a directory', ensure_dir, self.filename)
def download_if_criteria_met(videos, lang_code, force, response_code, date_since_attempt, frequency_to_save, *args, **kwargs):
    """Execute download of subtitle if it meets the criteria specified by the command line args

    Note: videos are a dict; keys=youtube_id, values=data
    Note: lang_code is in IETF format.

    Filters the video dict by api-response code and by last-attempt date,
    downloads each remaining subtitle, records success/failure in the json
    status map, and periodically (every frequency_to_save loops) stores
    updated subtitle counts.
    """
    date_specified = convert_date_input(date_since_attempt)

    # Filter up front, for efficiency (& reporting's sake)
    n_videos = len(videos)
    logging.info("There are (up to) %s total videos with subtitles for language '%s'. Let's go get them!" % (n_videos, lang_code))

    # Filter based on response code
    if response_code and response_code != "all":
        logging.info("Filtering based on response code (%s)..." % response_code)
        response_code_filter = partial(lambda vid, rcode: rcode == vid["api_response"], rcode=response_code)
        videos = dict([(k, v) for k, v in videos.iteritems() if response_code_filter(v)])
        logging.info("%4d of %4d videos match your specified response code (%s)" % (len(videos), n_videos, response_code))

    # Filter based on date: keep only entries last attempted before the cutoff
    # (or never attempted).  Deep-copy so we can delete while iterating.
    if date_specified:
        logging.info("Filtering based on date...")
        videos_copy = copy.deepcopy(videos)
        for k, v in videos.items():
            if not v["last_attempt"] or datetime.datetime.strptime(v["last_attempt"], "%Y-%m-%d") < date_specified:
                continue
            else:
                del videos_copy[k]
        videos = videos_copy
        logging.info("%4d of %4d videos need refreshing (last refresh more recent than %s)" % (len(videos), n_videos, date_specified))

    # Loop over videos needing refreshing
    n_loops = 0
    srt_count = None  # stays None iff no counts were ever stored
    for youtube_id, entry in videos.items():
        previously_downloaded = entry.get("downloaded")
        if previously_downloaded and not force:
            logging.info("Already downloaded %s/%s. To redownload, run again with -f." % (lang_code, youtube_id))
            continue

        logging.debug("Attempting to download subtitle for lang: %s and YouTube ID: %s" % (lang_code, youtube_id))
        response = download_subtitle(youtube_id, lang_code, format="srt")
        time_of_attempt = unicode(datetime.datetime.now().date())

        if response == "client-error" or response == "server-error":
            # Couldn't download; record the failure (leaving "downloaded" as it was).
            logging.info("%s/%s.srt: Updating JSON file to record error (%s)." % (lang_code, youtube_id, response))
            update_json(youtube_id, lang_code, previously_downloaded, response, time_of_attempt)
        else:
            # Success: write the srt file and record it.
            dirpath = get_srt_path(lang_code)
            fullpath = os.path.join(dirpath, youtube_id + ".srt")
            ensure_dir(dirpath)
            logging.debug("Writing file to %s" % fullpath)
            with open(fullpath, "w") as fp:
                fp.write(response.encode("UTF-8"))

            logging.info("%s/%s.srt: Updating JSON file to record success." % (lang_code, youtube_id))
            update_json(youtube_id, lang_code, True, "success", time_of_attempt)

        # Update srt availability mapping (checkpoint periodically and on the last loop)
        n_loops += 1
        if n_loops % frequency_to_save == 0 or n_loops == len(videos.keys()):
            srt_count = store_new_counts(lang_code=lang_code)
            logging.info("%s: On loop %d / %d, stored: subtitle count = %d." % (lang_code, n_loops, len(videos), srt_count))

    # Summarize output
    if srt_count is None:  # only none if nothing was done.
        logging.info("Nothing was done.")
    else:
        logging.info("We now have %d subtitles (amara thought they had %d) for language '%s'!" % (srt_count, n_videos, lang_code))
def test_new_dir_after_file(self):
    # Creating a directory underneath a path occupied by a regular file must
    # fail with "Not a directory" and must leave nothing behind on disk.
    newdir = os.path.join(self.filename, 'newdir')
    self.assertRaisesRegexp(OSError, 'Not a directory', ensure_dir, newdir)
    self.assertNotExists(newdir)
def handle(self, *args, **options):
    """Download every VideoFile flagged for download, one at a time.

    Loops over VideoFile rows with flagged_for_download=True, downloading
    each (urllib first, youtube-dl as fallback), tracking progress through
    the command's stage API, and recording per-video success/failure.
    Optionally regenerates video-related cached pages afterwards when
    options["auto_cache"] is set.
    """
    # NOTE(review): self.video is set to None here and never reassigned
    # (the loop uses the local `video` instead), yet the auto_cache branch
    # below reads self.video.youtube_id — that would raise AttributeError
    # on None. Confirm whether self.video should be assigned in the loop.
    self.video = None

    handled_youtube_ids = []  # stored to deal with caching
    failed_youtube_ids = []  # stored to avoid requerying failures.

    # Run at lowest OS priority so downloads don't starve the server.
    set_process_priority.lowest(logging=settings.LOG)

    try:
        while True:  # loop until the method is aborted
            # Grab any video that hasn't been tried yet
            videos = VideoFile.objects \
                .filter(flagged_for_download=True, download_in_progress=False) \
                .exclude(youtube_id__in=failed_youtube_ids)
            video_count = videos.count()
            if video_count == 0:
                self.stdout.write(_("Nothing to download; exiting.") + "\n")
                break

            # Grab a video as OURS to handle, set fields to indicate to others that we're on it!
            # Update the video logging
            video = videos[0]
            video.download_in_progress = True
            video.percent_complete = 0
            video.save()
            self.stdout.write((_("Downloading video '%(youtube_id)s'...") + "\n") % {"youtube_id": video.youtube_id})

            # Update the progress logging
            # Total stages = remaining + already handled + failed (+1 for the
            # optional cache-regeneration stage).
            self.set_stages(num_stages=video_count + len(handled_youtube_ids) + len(failed_youtube_ids) + int(options["auto_cache"]))
            if not self.started():
                self.start(stage_name=video.youtube_id)

            # Initiate the download process
            try:
                ensure_dir(settings.CONTENT_ROOT)

                progress_callback = partial(self.download_progress_callback, video)
                try:
                    # Download via urllib
                    download_video(video.youtube_id, callback=progress_callback)
                except URLNotFound:
                    # Video was not found on amazon cloud service,
                    # either due to a KA mistake, or due to the fact
                    # that it's a dubbed video.
                    #
                    # We can use youtube-dl to get that video!!
                    logging.debug(_("Retrieving youtube video %(youtube_id)s via youtube-dl") % {"youtube_id": video.youtube_id})

                    # Adapter: translate youtube-dl's stats dict into the
                    # percent-based progress callback used above.
                    def youtube_dl_cb(stats, progress_callback, *args, **kwargs):
                        if stats['status'] == "finished":
                            percent = 100.
                        elif stats['status'] == "downloading":
                            percent = 100. * stats['downloaded_bytes'] / stats['total_bytes']
                        else:
                            percent = 0.
                        progress_callback(percent=percent)
                    scrape_video(video.youtube_id, quiet=not settings.DEBUG, callback=partial(youtube_dl_cb, progress_callback=progress_callback))

                # If we got here, we downloaded ... somehow :)
                handled_youtube_ids.append(video.youtube_id)
                self.stdout.write(_("Download is complete!") + "\n")

            except DownloadCancelled:
                # Cancellation event: reset the video's flags so it is neither
                # in-progress nor queued, and stop retrying it this run.
                video.percent_complete = 0
                video.flagged_for_download = False
                video.download_in_progress = False
                video.save()
                failed_youtube_ids.append(video.youtube_id)

            except Exception as e:
                # On error, report the error, mark the video as not downloaded,
                # and allow the loop to try other videos.
                msg = _("Error in downloading %(youtube_id)s: %(error_msg)s") % {"youtube_id": video.youtube_id, "error_msg": unicode(e)}
                self.stderr.write("%s\n" % msg)

                # If a connection error, we should retry.
                # (Errno 8 — presumably a name-resolution/network failure;
                # TODO confirm which platform errno this targets.)
                if isinstance(e, DownloadError):
                    connection_error = "[Errno 8]" in e.message
                elif isinstance(e, IOError) and hasattr(e, "strerror"):
                    connection_error = e.strerror[0] == 8
                else:
                    connection_error = False

                video.download_in_progress = False
                # Keep the flag set only for retryable connection errors;
                # any error other than a connection error is fatal.
                video.flagged_for_download = connection_error
                video.save()

                # Rather than getting stuck on one video, continue to the next video.
                self.update_stage(stage_status="error", notes=_("%(error_msg)s; continuing to next video.") % {"error_msg": msg})
                failed_youtube_ids.append(video.youtube_id)
                continue

        # This can take a long time, without any further update, so ... best to avoid.
        if options["auto_cache"] and caching.caching_is_enabled() and handled_youtube_ids:
            self.update_stage(stage_name=self.video.youtube_id, stage_percent=0, notes=_("Generating all pages related to videos."))
            caching.regenerate_all_pages_related_to_videos(video_ids=list(set([i18n.get_video_id(yid) or yid for yid in handled_youtube_ids])))

        # Update
        self.complete(notes=_("Downloaded %(num_handled_videos)s of %(num_total_videos)s videos successfully.") % {
            "num_handled_videos": len(handled_youtube_ids),
            "num_total_videos": len(handled_youtube_ids) + len(failed_youtube_ids),
        })

    except Exception as e:
        # Any unexpected failure cancels the whole job (marking the stage as
        # errored) and re-raises so the caller sees the traceback.
        self.cancel(stage_status="error", notes=_("Error: %(error_msg)s") % {"error_msg": e})
        raise