def annotate_tree(topic, depth=0, parent=None): """ We need to recurse into the tree in order to annotate elements with topic data and exercise data """ children = topic.get('children', []) new_children = [] for child_topic in children: if child_topic.get("kind") in ("Video", "Topic"): annotate_tree(child_topic, depth=depth + 1, parent=topic) new_children.append(child_topic) topic["children"] = new_children if topic.get("kind") == "Exercise": topic['exercise'] = exercise_cache.get(topic.get("id"), {}) exercise_json_output[topic.get("id")] = topic['exercise'] elif topic.get("kind") == "Topic": pass else: topic['exercise'] = None topic['content'] = content_cache.get(topic.get("id"), {}) content_json_output[topic.get("id")] = topic['content'] if not topic['content']: logger.error('No content!?, id is: {}'.format(topic.get('id'))) # Translate everything for good measure with i18n.translate_block(language): topic["title"] = _(topic.get("title", "")) topic["description"] = _(topic.get("description", "")) if topic.get("description") else "" topic["url"] = topic["id"] + ".html" topic["parent"] = parent topic["depth"] = depth for key in ("child_data", "keywords", "hide", "contains"): topic.pop(key, None)
def annotate_tree(topic, depth=0, parent=None): """ We need to recurse into the tree in order to annotate elements with topic data and exercise data """ children = topic.get('children', []) new_children = [] for child_topic in children: if child_topic.get("kind") in ("Video", "Topic"): annotate_tree(child_topic, depth=depth + 1, parent=topic) new_children.append(child_topic) topic["children"] = new_children if topic.get("kind") == "Exercise": topic['exercise'] = exercise_cache.get(topic.get("id"), {}) exercise_json_output[topic.get("id")] = topic['exercise'] elif topic.get("kind") == "Topic": pass else: topic['exercise'] = None topic['content'] = content_cache.get(topic.get("id"), {}) content_json_output[topic.get("id")] = topic['content'] if not topic['content']: logger.error('No content!?, id is: {}'.format( topic.get('id'))) # Translate everything for good measure with i18n.translate_block(language): topic["title"] = _(topic.get("title", "")) topic["description"] = _(topic.get( "description", "")) if topic.get("description") else "" topic["url"] = topic["id"] + ".html" topic["parent"] = parent topic["depth"] = depth for key in ("child_data", "keywords", "hide", "contains"): topic.pop(key, None)
def copy_media(node): if node['kind'] == 'Topic': # Don't do anything if it's a topic pass elif node['kind'] == 'Exercise': # Exercises cannot be displayed node["content"]["available"] = False elif node['kind'] == 'Video': if node['content']['format'] == "webm": logger.warning("Found a duplicate ID for {}, re-downloading".format(node['id'])) node['content']['format'] = "mp4" # Available is False by default until we locate the file node["content"]["available"] = False node_dir = os.path.join(tmp_dir, node["path"]) if not os.path.exists(node_dir): os.makedirs(node_dir) video_file_name = node['id'] + '.' + node['content']['format'] thumb_file_name = node['id'] + '.png' video_file_src = os.path.join(CONTENT_ROOT, video_file_name) video_file_dest = os.path.join(node_dir, video_file_name) thumb_file_src = os.path.join(CONTENT_ROOT, thumb_file_name) thumb_file_dest = os.path.join(node_dir, thumb_file_name) if options['download'] and not os.path.exists(video_file_src): logger.info("Video file being downloaded to: {}".format(video_file_src)) download_video( node['content']['youtube_id'], node['content']['format'], CONTENT_ROOT, ) if os.path.exists(video_file_src): if transcode2webm: ffmpeg_pass_log = "/tmp/logfile_vp8.fpf" if os.path.isfile(ffmpeg_pass_log): os.unlink(ffmpeg_pass_log) video_file_name = node['id'] + '.webm' video_file_dest = os.path.join(node_dir, video_file_name) if os.path.isfile(video_file_dest): logger.info("Already encoded: {}".format(video_file_dest)) else: ffmpeg_base_args = [ ffmpeg, "-i", video_file_src, "-codec:v", "libvpx", "-quality", "best", "-cpu-used", "0", "-b:v", "300k", "-qmin", "10", # 10=lowest value "-qmax", "35", # 42=highest value "-maxrate", "300k", "-bufsize", "600k", "-threads", "8", # "-vf", "scale=-1", "-codec:a", "libvorbis", # "-b:a", "128k", "-aq", "5", "-f", "webm", ] ffmpeg_pass1 = ffmpeg_base_args + [ "-an", # Disables audio, no effect first pass "-pass", "1", "-passlogfile", ffmpeg_pass_log, video_file_dest, ] ffmpeg_pass2 = ffmpeg_base_args + [ "-pass", "2", "-y", "-passlogfile", ffmpeg_pass_log, video_file_dest, ] for cmd in (ffmpeg_pass1, ffmpeg_pass2): process = subprocess.Popen(cmd, stdout=subprocess.PIPE) stdout_data, _stderr_data = process.communicate() if process.returncode != 0: logger.error("Error invoking ffmpeg: {}".format((_stderr_data or "") + (stdout_data or ""))) logger.error("Command was: {}".format(" ".join(cmd))) raise CommandError("Could not complete transcoding") node['content']['format'] = "webm" else: # If not transcoding, just link the original file os.link(video_file_src, video_file_dest) node["video_url"] = os.path.join( node["path"], video_file_name ) copy_media.videos_found += 1 logger.info("Videos processed: {}".format(copy_media.videos_found)) node["content"]["available"] = True # Create thumbnail if it wasn't downloaded if not os.path.exists(thumb_file_src): fp = create_thumbnail(video_file_src, output_format="png") if fp is None: logger.error("Failed to create thumbnail for {}".format(video_file_src)) else: logger.info("Successfully created thumbnail for {}".format(video_file_src)) file(thumb_file_src, 'wb').write(fp.read()) # Handle thumbnail if os.path.exists(thumb_file_src): node["thumbnail_url"] = os.path.join( node["path"], node['id'] + '.png' ) if not os.path.exists(thumb_file_dest): os.link(thumb_file_src, thumb_file_dest) else: node["thumbnail_url"] = None subtitle_srt = os.path.join( subtitle_src_dir, node['id'] + '.srt' ) if os.path.isfile(subtitle_srt): subtitle_vtt = os.path.join( node_dir, node['id'] + '.vtt' ) # Convert to .vtt because this format is understood # by latest video.js and the old ones that read # .srt don't work with newer jquery etc. submarine_parser(subtitle_srt, subtitle_vtt) if not os.path.exists(subtitle_vtt): logger.warning("Subtitle not converted: {}".format(subtitle_srt)) else: logger.info("Subtitle convert from SRT to VTT: {}".format(subtitle_vtt)) node["subtitle_url"] = os.path.join( node["path"], node['id'] + '.vtt' ) else: if options['download']: logger.error("File not found or downloaded: {}".format(video_file_src)) else: logger.error("Invalid node, kind: {}".format(node.get("kind", None))) # Exercises cannot be displayed node["content"] = {"available": False} new_children = [] for child in node.get('children', []): copy_media(child) empty_topic = child["kind"] == "Topic" and not child.get("children", []) unavailable_video = child["kind"] == "Video" and not child.get("content", {}).get("available", False) if not (empty_topic or unavailable_video): new_children.append(child) node['children'] = new_children
def handle(self, *args, **options): if len(args) != 1: raise CommandError("Takes exactly 1 argument") dest_file = os.path.abspath(args[0]) logger.info("Starting up KA Lite export2zim command") beginning = datetime.now() logger.info("Begin: {}".format(beginning)) language = options.get('language') if not language: raise CommandError("Must specify a language!") if not options.get('tmp_dir'): tmp_dir = os.path.join(tempfile.gettempdir(), 'ka-lite-zim_{}'.format(language)) else: tmp_dir = options.get('tmp_dir') tmp_dir = os.path.abspath(tmp_dir) if os.path.exists(tmp_dir) and os.listdir(tmp_dir): if options['clear']: logger.info("Clearing directory {}".format(tmp_dir)) shutil.rmtree(tmp_dir) elif options['resume']: logger.info("Resuming in dirty tmp directory {}".format(tmp_dir)) else: raise CommandError( "{} not empty, use the -c option to clean it, -r to resume, or use an empty destination directory.".format( tmp_dir ) ) zimwriterfs = options.get("zimwriterfs", None) publisher = options.get("publisher") transcode2webm = options.get("transcode2webm") ffmpeg = find_executable("ffmpeg") if not ffmpeg: logger.warning("FFMpeg not found in your path, you won't be able to create missing thumbnails or transcode to webm.") if not zimwriterfs: zimwriterfs = find_executable("zimwriterfs") if not zimwriterfs: raise CommandError("Could not find zimwriterfs in your path, try specifying --zimwriterfs=/path") if not os.path.exists(zimwriterfs): raise CommandError("Invalid --zimwriterfs") from kalite_zim import __name__ as base_path base_path = os.path.abspath(base_path) data_path = os.path.join(base_path, 'data') # Where subtitles are found in KA Lite subtitle_src_dir = i18n.get_srt_path(language) logger.info("Will export videos for language: {}".format(language)) logger.info("Preparing KA Lite topic tree...") # Use live data if not options.get('test'): # This way of doing things will be deprecated in KA Lite 0.16 topic_tree_json_path = topic_tools_settings.TOPICS_FILEPATHS.get('khan') content_cache = get_content_cache(language=language, annotate=True) exercise_cache = get_exercise_cache(language=language) # Use test data else: topic_tree_json_path = os.path.join(data_path, 'test_topics.json') content_cache = json.load( open(os.path.join(data_path, 'test_content.json')) ) exercise_cache = json.load( open(os.path.join(data_path, 'test_exercise.json')) ) topic_tree = softload_json(topic_tree_json_path, logger=logger.debug, raises=False) content_json_output = {} exercise_json_output = {} def annotate_tree(topic, depth=0, parent=None): """ We need to recurse into the tree in order to annotate elements with topic data and exercise data """ children = topic.get('children', []) new_children = [] for child_topic in children: if child_topic.get("kind") in ("Video", "Topic"): annotate_tree(child_topic, depth=depth + 1, parent=topic) new_children.append(child_topic) topic["children"] = new_children if topic.get("kind") == "Exercise": topic['exercise'] = exercise_cache.get(topic.get("id"), {}) exercise_json_output[topic.get("id")] = topic['exercise'] elif topic.get("kind") == "Topic": pass else: topic['exercise'] = None topic['content'] = content_cache.get(topic.get("id"), {}) content_json_output[topic.get("id")] = topic['content'] if not topic['content']: logger.error('No content!?, id is: {}'.format(topic.get('id'))) # Translate everything for good measure with i18n.translate_block(language): topic["title"] = _(topic.get("title", "")) topic["description"] = _(topic.get("description", "")) if topic.get("description") else "" topic["url"] = topic["id"] + ".html" topic["parent"] = parent topic["depth"] = depth for key in ("child_data", "keywords", "hide", "contains"): topic.pop(key, None) # 1. Annotate a topic tree annotate_tree(topic_tree) # 2. Now go through the tree and copy each element into the destination # zim file system def copy_media(node): if node['kind'] == 'Topic': # Don't do anything if it's a topic pass elif node['kind'] == 'Exercise': # Exercises cannot be displayed node["content"]["available"] = False elif node['kind'] == 'Video': if node['content']['format'] == "webm": logger.warning("Found a duplicate ID for {}, re-downloading".format(node['id'])) node['content']['format'] = "mp4" # Available is False by default until we locate the file node["content"]["available"] = False node_dir = os.path.join(tmp_dir, node["path"]) if not os.path.exists(node_dir): os.makedirs(node_dir) video_file_name = node['id'] + '.' + node['content']['format'] thumb_file_name = node['id'] + '.png' video_file_src = os.path.join(CONTENT_ROOT, video_file_name) video_file_dest = os.path.join(node_dir, video_file_name) thumb_file_src = os.path.join(CONTENT_ROOT, thumb_file_name) thumb_file_dest = os.path.join(node_dir, thumb_file_name) if options['download'] and not os.path.exists(video_file_src): logger.info("Video file being downloaded to: {}".format(video_file_src)) download_video( node['content']['youtube_id'], node['content']['format'], CONTENT_ROOT, ) if os.path.exists(video_file_src): if transcode2webm: ffmpeg_pass_log = "/tmp/logfile_vp8.fpf" if os.path.isfile(ffmpeg_pass_log): os.unlink(ffmpeg_pass_log) video_file_name = node['id'] + '.webm' video_file_dest = os.path.join(node_dir, video_file_name) if os.path.isfile(video_file_dest): logger.info("Already encoded: {}".format(video_file_dest)) else: ffmpeg_base_args = [ ffmpeg, "-i", video_file_src, "-codec:v", "libvpx", "-quality", "best", "-cpu-used", "0", "-b:v", "300k", "-qmin", "10", # 10=lowest value "-qmax", "35", # 42=highest value "-maxrate", "300k", "-bufsize", "600k", "-threads", "8", # "-vf", "scale=-1", "-codec:a", "libvorbis", # "-b:a", "128k", "-aq", "5", "-f", "webm", ] ffmpeg_pass1 = ffmpeg_base_args + [ "-an", # Disables audio, no effect first pass "-pass", "1", "-passlogfile", ffmpeg_pass_log, video_file_dest, ] ffmpeg_pass2 = ffmpeg_base_args + [ "-pass", "2", "-y", "-passlogfile", ffmpeg_pass_log, video_file_dest, ] for cmd in (ffmpeg_pass1, ffmpeg_pass2): process = subprocess.Popen(cmd, stdout=subprocess.PIPE) stdout_data, _stderr_data = process.communicate() if process.returncode != 0: logger.error("Error invoking ffmpeg: {}".format((_stderr_data or "") + (stdout_data or ""))) logger.error("Command was: {}".format(" ".join(cmd))) raise CommandError("Could not complete transcoding") node['content']['format'] = "webm" else: # If not transcoding, just link the original file os.link(video_file_src, video_file_dest) node["video_url"] = os.path.join( node["path"], video_file_name ) copy_media.videos_found += 1 logger.info("Videos processed: {}".format(copy_media.videos_found)) node["content"]["available"] = True # Create thumbnail if it wasn't downloaded if not os.path.exists(thumb_file_src): fp = create_thumbnail(video_file_src, output_format="png") if fp is None: logger.error("Failed to create thumbnail for {}".format(video_file_src)) else: logger.info("Successfully created thumbnail for {}".format(video_file_src)) file(thumb_file_src, 'wb').write(fp.read()) # Handle thumbnail if os.path.exists(thumb_file_src): node["thumbnail_url"] = os.path.join( node["path"], node['id'] + '.png' ) if not os.path.exists(thumb_file_dest): os.link(thumb_file_src, thumb_file_dest) else: node["thumbnail_url"] = None subtitle_srt = os.path.join( subtitle_src_dir, node['id'] + '.srt' ) if os.path.isfile(subtitle_srt): subtitle_vtt = os.path.join( node_dir, node['id'] + '.vtt' ) # Convert to .vtt because this format is understood # by latest video.js and the old ones that read # .srt don't work with newer jquery etc. submarine_parser(subtitle_srt, subtitle_vtt) if not os.path.exists(subtitle_vtt): logger.warning("Subtitle not converted: {}".format(subtitle_srt)) else: logger.info("Subtitle convert from SRT to VTT: {}".format(subtitle_vtt)) node["subtitle_url"] = os.path.join( node["path"], node['id'] + '.vtt' ) else: if options['download']: logger.error("File not found or downloaded: {}".format(video_file_src)) else: logger.error("Invalid node, kind: {}".format(node.get("kind", None))) # Exercises cannot be displayed node["content"] = {"available": False} new_children = [] for child in node.get('children', []): copy_media(child) empty_topic = child["kind"] == "Topic" and not child.get("children", []) unavailable_video = child["kind"] == "Video" and not child.get("content", {}).get("available", False) if not (empty_topic or unavailable_video): new_children.append(child) node['children'] = new_children copy_media.videos_found = 0 def render_topic_pages(node): parents = [node] if node.get("children") else [] parent = node["parent"] while parent: parents.append(parent) parent = parent["parent"] # Finally, render templates into the destination template_context = { "topic_tree": topic_tree, "topic": node, "parents": parents } with i18n.translate_block(language): topic_html = render_to_string("kalite_zim/topic.html", template_context) # Replace absolute references to '/static' with relative topic_html = topic_html.replace("/static", "static") dest_html = os.path.join(tmp_dir, node["id"] + ".html") logger.info("Rendering {}".format(dest_html)) open(dest_html, "w").write(topic_html) render_topic_pages.pages_rendered += 1 for child in node.get('children', []): render_topic_pages(child) render_topic_pages.pages_rendered = 0 logger.info("Hard linking video files from KA Lite...") copy_media(topic_tree) sys.stderr.write("\n") logger.info("Done!") # Configure django-compressor compressor_init(os.path.join(base_path, 'static')) # Finally, render templates into the destination template_context = { "topic_tree": topic_tree, "welcome": True, } with i18n.translate_block(language): welcome_html = render_to_string("kalite_zim/welcome.html", template_context) about_html = render_to_string("kalite_zim/about.html", template_context) # Replace absolute references to '/static' with relative welcome_html = welcome_html.replace("/static", "static") about_html = about_html.replace("/static", "static") # Write the welcome.html file open(os.path.join(tmp_dir, 'welcome.html'), 'w').write(welcome_html) open(os.path.join(tmp_dir, 'about.html'), 'w').write(about_html) # Render all topic html files render_topic_pages(topic_tree) # Copy in static data after it's been handled by django compressor # (this happens during template rendering) shutil.copytree(os.path.join(base_path, 'static'), os.path.join(tmp_dir, 'static')) ending = datetime.now() duration = int((ending - beginning).total_seconds()) logger.info("Total number of videos found: {}".format(copy_media.videos_found)) logger.info("Total number of topic pages created: {}".format(render_topic_pages.pages_rendered)) logger.info("Invoking zimwriterfs, writing to: {}".format(dest_file)) zimwriterfs_args = ( zimwriterfs, "--welcome", "welcome.html", "--favicon", "static/img/ka_leaf.png", "--publisher", publisher, "--creator", "KhanAcademy.org", "--description", "Khan Academy ({})".format(language), "--description", "Videos from Khan Academy", "--language", language, tmp_dir, dest_file, ) process = subprocess.Popen(zimwriterfs_args, stdout=subprocess.PIPE) stdout_data, _stderr_data = process.communicate() if process.returncode != 0: logger.error("Error invoking zimwriterfs: {}").format(_stderr_data + stdout_data) logger.info( "Duration: {h:} hours, {m:} minutes, {s:} seconds".format( h=duration // 3600, m=(duration % 3600) // 60, s=duration % 60, ) )
def copy_media(node): if node['kind'] == 'Topic': # Don't do anything if it's a topic pass elif node['kind'] == 'Exercise': # Exercises cannot be displayed node["content"]["available"] = False elif node['kind'] == 'Video': if node['content']['format'] == "webm": logger.warning( "Found a duplicate ID for {}, re-downloading".format( node['id'])) node['content']['format'] = "mp4" # Available is False by default until we locate the file node["content"]["available"] = False node_dir = os.path.join(tmp_dir, node["path"]) if not os.path.exists(node_dir): os.makedirs(node_dir) video_file_name = node['id'] + '.' + node['content']['format'] thumb_file_name = node['id'] + '.png' video_file_src = os.path.join(CONTENT_ROOT, video_file_name) video_file_dest = os.path.join(node_dir, video_file_name) thumb_file_src = os.path.join(CONTENT_ROOT, thumb_file_name) thumb_file_dest = os.path.join(node_dir, thumb_file_name) if options['download'] and not os.path.exists(video_file_src): logger.info("Video file being downloaded to: {}".format( video_file_src)) download_video( node['content']['youtube_id'], node['content']['format'], CONTENT_ROOT, ) if os.path.exists(video_file_src): if transcode2webm: ffmpeg_pass_log = "/tmp/logfile_vp8.fpf" if os.path.isfile(ffmpeg_pass_log): os.unlink(ffmpeg_pass_log) video_file_name = node['id'] + '.webm' video_file_dest = os.path.join(node_dir, video_file_name) if os.path.isfile(video_file_dest): logger.info( "Already encoded: {}".format(video_file_dest)) else: ffmpeg_base_args = [ ffmpeg, "-i", video_file_src, "-codec:v", "libvpx", "-quality", "best", "-cpu-used", "0", "-b:v", "300k", "-qmin", "10", # 10=lowest value "-qmax", "35", # 42=highest value "-maxrate", "300k", "-bufsize", "600k", "-threads", "8", # "-vf", "scale=-1", "-codec:a", "libvorbis", # "-b:a", "128k", "-aq", "5", "-f", "webm", ] ffmpeg_pass1 = ffmpeg_base_args + [ "-an", # Disables audio, no effect first pass "-pass", "1", "-passlogfile", ffmpeg_pass_log, video_file_dest, ] ffmpeg_pass2 = ffmpeg_base_args + [ "-pass", "2", "-y", "-passlogfile", ffmpeg_pass_log, video_file_dest, ] for cmd in (ffmpeg_pass1, ffmpeg_pass2): process = subprocess.Popen( cmd, stdout=subprocess.PIPE) stdout_data, _stderr_data = process.communicate( ) if process.returncode != 0: logger.error( "Error invoking ffmpeg: {}".format( (_stderr_data or "") + (stdout_data or ""))) logger.error("Command was: {}".format( " ".join(cmd))) raise CommandError( "Could not complete transcoding") node['content']['format'] = "webm" else: # If not transcoding, just link the original file os.link(video_file_src, video_file_dest) node["video_url"] = os.path.join(node["path"], video_file_name) copy_media.videos_found += 1 logger.info("Videos processed: {}".format( copy_media.videos_found)) node["content"]["available"] = True # Create thumbnail if it wasn't downloaded if not os.path.exists(thumb_file_src): fp = create_thumbnail(video_file_src, output_format="png") if fp is None: logger.error( "Failed to create thumbnail for {}".format( video_file_src)) else: logger.info( "Successfully created thumbnail for {}".format( video_file_src)) file(thumb_file_src, 'wb').write(fp.read()) # Handle thumbnail if os.path.exists(thumb_file_src): node["thumbnail_url"] = os.path.join( node["path"], node['id'] + '.png') if not os.path.exists(thumb_file_dest): os.link(thumb_file_src, thumb_file_dest) else: node["thumbnail_url"] = None subtitle_srt = os.path.join(subtitle_src_dir, node['id'] + '.srt') if os.path.isfile(subtitle_srt): subtitle_vtt = os.path.join(node_dir, node['id'] + '.vtt') # Convert to .vtt because this format is understood # by latest video.js and the old ones that read # .srt don't work with newer jquery etc. submarine_parser(subtitle_srt, subtitle_vtt) if not os.path.exists(subtitle_vtt): logger.warning("Subtitle not converted: {}".format( subtitle_srt)) else: logger.info( "Subtitle convert from SRT to VTT: {}".format( subtitle_vtt)) node["subtitle_url"] = os.path.join( node["path"], node['id'] + '.vtt') else: if options['download']: logger.error("File not found or downloaded: {}".format( video_file_src)) else: logger.error("Invalid node, kind: {}".format( node.get("kind", None))) # Exercises cannot be displayed node["content"] = {"available": False} new_children = [] for child in node.get('children', []): copy_media(child) empty_topic = child["kind"] == "Topic" and not child.get( "children", []) unavailable_video = child["kind"] == "Video" and not child.get( "content", {}).get("available", False) if not (empty_topic or unavailable_video): new_children.append(child) node['children'] = new_children
def handle(self, *args, **options): if len(args) != 1: raise CommandError("Takes exactly 1 argument") dest_file = os.path.abspath(args[0]) logger.info("Starting up KA Lite export2zim command") beginning = datetime.now() logger.info("Begin: {}".format(beginning)) language = options.get('language') if not language: raise CommandError("Must specify a language!") if not options.get('tmp_dir'): tmp_dir = os.path.join(tempfile.gettempdir(), 'ka-lite-zim_{}'.format(language)) else: tmp_dir = options.get('tmp_dir') tmp_dir = os.path.abspath(tmp_dir) if os.path.exists(tmp_dir) and os.listdir(tmp_dir): if options['clear']: logger.info("Clearing directory {}".format(tmp_dir)) shutil.rmtree(tmp_dir) elif options['resume']: logger.info( "Resuming in dirty tmp directory {}".format(tmp_dir)) else: raise CommandError( "{} not empty, use the -c option to clean it, -r to resume, or use an empty destination directory." .format(tmp_dir)) zimwriterfs = options.get("zimwriterfs", None) publisher = options.get("publisher") transcode2webm = options.get("transcode2webm") ffmpeg = find_executable("ffmpeg") if not ffmpeg: logger.warning( "FFMpeg not found in your path, you won't be able to create missing thumbnails or transcode to webm." ) if not zimwriterfs: zimwriterfs = find_executable("zimwriterfs") if not zimwriterfs: raise CommandError( "Could not find zimwriterfs in your path, try specifying --zimwriterfs=/path" ) if not os.path.exists(zimwriterfs): raise CommandError("Invalid --zimwriterfs") from kalite_zim import __name__ as base_path base_path = os.path.abspath(base_path) data_path = os.path.join(base_path, 'data') # Where subtitles are found in KA Lite subtitle_src_dir = i18n.get_srt_path(language) logger.info("Will export videos for language: {}".format(language)) logger.info("Preparing KA Lite topic tree...") # Use live data if not options.get('test'): # This way of doing things will be deprecated in KA Lite 0.16 topic_tree_json_path = topic_tools_settings.TOPICS_FILEPATHS.get( 'khan') content_cache = get_content_cache(language=language, annotate=True) exercise_cache = get_exercise_cache(language=language) # Use test data else: topic_tree_json_path = os.path.join(data_path, 'test_topics.json') content_cache = json.load( open(os.path.join(data_path, 'test_content.json'))) exercise_cache = json.load( open(os.path.join(data_path, 'test_exercise.json'))) topic_tree = softload_json(topic_tree_json_path, logger=logger.debug, raises=False) content_json_output = {} exercise_json_output = {} def annotate_tree(topic, depth=0, parent=None): """ We need to recurse into the tree in order to annotate elements with topic data and exercise data """ children = topic.get('children', []) new_children = [] for child_topic in children: if child_topic.get("kind") in ("Video", "Topic"): annotate_tree(child_topic, depth=depth + 1, parent=topic) new_children.append(child_topic) topic["children"] = new_children if topic.get("kind") == "Exercise": topic['exercise'] = exercise_cache.get(topic.get("id"), {}) exercise_json_output[topic.get("id")] = topic['exercise'] elif topic.get("kind") == "Topic": pass else: topic['exercise'] = None topic['content'] = content_cache.get(topic.get("id"), {}) content_json_output[topic.get("id")] = topic['content'] if not topic['content']: logger.error('No content!?, id is: {}'.format( topic.get('id'))) # Translate everything for good measure with i18n.translate_block(language): topic["title"] = _(topic.get("title", "")) topic["description"] = _(topic.get( "description", "")) if topic.get("description") else "" topic["url"] = topic["id"] + ".html" topic["parent"] = parent topic["depth"] = depth for key in ("child_data", "keywords", "hide", "contains"): topic.pop(key, None) # 1. Annotate a topic tree annotate_tree(topic_tree) # 2. Now go through the tree and copy each element into the destination # zim file system def copy_media(node): if node['kind'] == 'Topic': # Don't do anything if it's a topic pass elif node['kind'] == 'Exercise': # Exercises cannot be displayed node["content"]["available"] = False elif node['kind'] == 'Video': if node['content']['format'] == "webm": logger.warning( "Found a duplicate ID for {}, re-downloading".format( node['id'])) node['content']['format'] = "mp4" # Available is False by default until we locate the file node["content"]["available"] = False node_dir = os.path.join(tmp_dir, node["path"]) if not os.path.exists(node_dir): os.makedirs(node_dir) video_file_name = node['id'] + '.' + node['content']['format'] thumb_file_name = node['id'] + '.png' video_file_src = os.path.join(CONTENT_ROOT, video_file_name) video_file_dest = os.path.join(node_dir, video_file_name) thumb_file_src = os.path.join(CONTENT_ROOT, thumb_file_name) thumb_file_dest = os.path.join(node_dir, thumb_file_name) if options['download'] and not os.path.exists(video_file_src): logger.info("Video file being downloaded to: {}".format( video_file_src)) download_video( node['content']['youtube_id'], node['content']['format'], CONTENT_ROOT, ) if os.path.exists(video_file_src): if transcode2webm: ffmpeg_pass_log = "/tmp/logfile_vp8.fpf" if os.path.isfile(ffmpeg_pass_log): os.unlink(ffmpeg_pass_log) video_file_name = node['id'] + '.webm' video_file_dest = os.path.join(node_dir, video_file_name) if os.path.isfile(video_file_dest): logger.info( "Already encoded: {}".format(video_file_dest)) else: ffmpeg_base_args = [ ffmpeg, "-i", video_file_src, "-codec:v", "libvpx", "-quality", "best", "-cpu-used", "0", "-b:v", "300k", "-qmin", "10", # 10=lowest value "-qmax", "35", # 42=highest value "-maxrate", "300k", "-bufsize", "600k", "-threads", "8", # "-vf", "scale=-1", "-codec:a", "libvorbis", # "-b:a", "128k", "-aq", "5", "-f", "webm", ] ffmpeg_pass1 = ffmpeg_base_args + [ "-an", # Disables audio, no effect first pass "-pass", "1", "-passlogfile", ffmpeg_pass_log, video_file_dest, ] ffmpeg_pass2 = ffmpeg_base_args + [ "-pass", "2", "-y", "-passlogfile", ffmpeg_pass_log, video_file_dest, ] for cmd in (ffmpeg_pass1, ffmpeg_pass2): process = subprocess.Popen( cmd, stdout=subprocess.PIPE) stdout_data, _stderr_data = process.communicate( ) if process.returncode != 0: logger.error( "Error invoking ffmpeg: {}".format( (_stderr_data or "") + (stdout_data or ""))) logger.error("Command was: {}".format( " ".join(cmd))) raise CommandError( "Could not complete transcoding") node['content']['format'] = "webm" else: # If not transcoding, just link the original file os.link(video_file_src, video_file_dest) node["video_url"] = os.path.join(node["path"], video_file_name) copy_media.videos_found += 1 logger.info("Videos processed: {}".format( copy_media.videos_found)) node["content"]["available"] = True # Create thumbnail if it wasn't downloaded if not os.path.exists(thumb_file_src): fp = create_thumbnail(video_file_src, output_format="png") if fp is None: logger.error( "Failed to create thumbnail for {}".format( video_file_src)) else: logger.info( "Successfully created thumbnail for {}".format( video_file_src)) file(thumb_file_src, 'wb').write(fp.read()) # Handle thumbnail if os.path.exists(thumb_file_src): node["thumbnail_url"] = os.path.join( node["path"], node['id'] + '.png') if not os.path.exists(thumb_file_dest): os.link(thumb_file_src, thumb_file_dest) else: node["thumbnail_url"] = None subtitle_srt = os.path.join(subtitle_src_dir, node['id'] + '.srt') if os.path.isfile(subtitle_srt): subtitle_vtt = os.path.join(node_dir, node['id'] + '.vtt') # Convert to .vtt because this format is understood # by latest video.js and the old ones that read # .srt don't work with newer jquery etc. submarine_parser(subtitle_srt, subtitle_vtt) if not os.path.exists(subtitle_vtt): logger.warning("Subtitle not converted: {}".format( subtitle_srt)) else: logger.info( "Subtitle convert from SRT to VTT: {}".format( subtitle_vtt)) node["subtitle_url"] = os.path.join( node["path"], node['id'] + '.vtt') else: if options['download']: logger.error("File not found or downloaded: {}".format( video_file_src)) else: logger.error("Invalid node, kind: {}".format( node.get("kind", None))) # Exercises cannot be displayed node["content"] = {"available": False} new_children = [] for child in node.get('children', []): copy_media(child) empty_topic = child["kind"] == "Topic" and not child.get( "children", []) unavailable_video = child["kind"] == "Video" and not child.get( "content", {}).get("available", False) if not (empty_topic or unavailable_video): new_children.append(child) node['children'] = new_children copy_media.videos_found = 0 def render_topic_pages(node): parents = [node] if node.get("children") else [] parent = node["parent"] while parent: parents.append(parent) parent = parent["parent"] # Finally, render templates into the destination template_context = { "topic_tree": topic_tree, "topic": node, "parents": parents } with i18n.translate_block(language): topic_html = render_to_string("kalite_zim/topic.html", template_context) # Replace absolute references to '/static' with relative topic_html = topic_html.replace("/static", "static") dest_html = os.path.join(tmp_dir, node["id"] + ".html") logger.info("Rendering {}".format(dest_html)) open(dest_html, "w").write(topic_html) render_topic_pages.pages_rendered += 1 for child in node.get('children', []): render_topic_pages(child) render_topic_pages.pages_rendered = 0 logger.info("Hard linking video files from KA Lite...") copy_media(topic_tree) sys.stderr.write("\n") logger.info("Done!") # Configure django-compressor compressor_init(os.path.join(base_path, 'static')) # Finally, render templates into the destination template_context = { "topic_tree": topic_tree, "welcome": True, } with i18n.translate_block(language): welcome_html = render_to_string("kalite_zim/welcome.html", template_context) about_html = render_to_string("kalite_zim/about.html", template_context) # Replace absolute references to '/static' with relative welcome_html = welcome_html.replace("/static", "static") about_html = about_html.replace("/static", "static") # Write the welcome.html file open(os.path.join(tmp_dir, 'welcome.html'), 'w').write(welcome_html) open(os.path.join(tmp_dir, 'about.html'), 'w').write(about_html) # Render all topic html files render_topic_pages(topic_tree) # Copy in static data after it's been handled by django compressor # (this happens during template rendering) shutil.copytree(os.path.join(base_path, 'static'), os.path.join(tmp_dir, 'static')) ending = datetime.now() duration = int((ending - beginning).total_seconds()) logger.info("Total number of videos found: {}".format( copy_media.videos_found)) logger.info("Total number of topic pages created: {}".format( render_topic_pages.pages_rendered)) logger.info("Invoking zimwriterfs, writing to: {}".format(dest_file)) zimwriterfs_args = ( zimwriterfs, "--welcome", "welcome.html", "--favicon", "static/img/ka_leaf.png", "--publisher", publisher, "--creator", "KhanAcademy.org", "--description", "Khan Academy ({})".format(language), "--description", "Videos from Khan Academy", "--language", language, tmp_dir, dest_file, ) process = subprocess.Popen(zimwriterfs_args, stdout=subprocess.PIPE) stdout_data, _stderr_data = process.communicate() if process.returncode != 0: logger.error("Error invoking zimwriterfs: {}").format( _stderr_data + stdout_data) logger.info("Duration: {h:} hours, {m:} minutes, {s:} seconds".format( h=duration // 3600, m=(duration % 3600) // 60, s=duration % 60, ))
def copy_media(node): if node['kind'] == 'Topic': # Don't do anything if it's a topic pass elif node['kind'] == 'Exercise': # Exercises cannot be displayed node["content"]["available"] = False elif node['kind'] == 'Video': if node['content']['format'] == "webm": logger.warning( "Found a duplicate ID for {}, re-downloading".format( node['id'])) node['content']['format'] = "mp4" # Available is False by default until we locate the file node["content"]["available"] = False subtitle_dir = os.path.join(tmp_dir, "subtitle") if not os.path.exists(subtitle_dir): os.makedirs(subtitle_dir) videos_dir = os.path.join(tmp_dir, "videos") if not os.path.exists(videos_dir): os.makedirs(videos_dir) if transcode2webm: video_file_name = node['id'] + '.' + 'webm' else: video_file_name = node['id'] + '.' + node['content'][ 'format'] video_file_src = os.path.join(CONTENT_ROOT, video_file_name) video_file_dest = os.path.join(videos_dir, video_file_name) thumb_dir = os.path.join(tmp_dir, "thumb") if not os.path.exists(thumb_dir): os.makedirs(thumb_dir) thumb_file_name = node['id'] + '.png' thumb_file_src = os.path.join(CONTENT_ROOT, thumb_file_name) thumb_file_dest = os.path.join(thumb_dir, thumb_file_name) if options['download'] and not os.path.exists(video_file_src): logger.info("Video file being downloaded to: {}".format( video_file_src)) if transcode2webm: yt_video_url = YOUTUBE_URL.format( id=node['content']['youtube_id']) cmd = [ 'youtube-dl', '-o', video_file_src, '-f', 'webm/mp4', '--recode-video', 'webm', '-k', yt_video_url ] process = subprocess.Popen(cmd, stdout=subprocess.PIPE) stdout_data, _stderr_data = process.communicate() if process.returncode != 0: logger.error("Error invoking ffmpeg: {}".format( (_stderr_data or "") + (stdout_data or ""))) logger.error("Command was: {}".format( " ".join(cmd))) raise CommandError( "Could not complete transcoding") node['content']['format'] = "webm" else: download_video( node['content']['youtube_id'], node['content']['format'], CONTENT_ROOT, ) if os.path.exists(video_file_src): if not os.path.exists(video_file_dest): copy_file(video_file_src, video_file_dest) node["video_url"] = os.path.join("videos", video_file_name) copy_media.videos_found += 1 logger.info("Videos processed: {}".format( copy_media.videos_found)) node["content"]["available"] = True # Create thumbnail if it wasn't downloaded if not os.path.exists(thumb_file_src): fp = create_thumbnail(video_file_src, output_format="png") if fp is None: logger.error( "Failed to create thumbnail for {}".format( video_file_src)) else: logger.info( "Successfully created thumbnail for {}".format( video_file_src)) file(thumb_file_src, 'wb').write(fp.read()) # Handle thumbnail if os.path.exists(thumb_file_src): node["thumbnail_url"] = os.path.join( "thumb", node['id'] + '.png') if not os.path.exists(thumb_file_dest): copy_file(thumb_file_src, thumb_file_dest) else: node["thumbnail_url"] = None subtitle_srt = os.path.join(subtitle_src_dir, node['id'] + '.srt') if os.path.isfile(subtitle_srt): subtitle_vtt = os.path.join(subtitle_dir, node['id'] + '.vtt') # Convert to .vtt because this format is understood # by latest video.js and the old ones that read # .srt don't work with newer jquery etc. submarine_parser(subtitle_srt, subtitle_vtt) if not os.path.exists(subtitle_vtt): logger.warning("Subtitle not converted: {}".format( subtitle_srt)) else: logger.info( "Subtitle convert from SRT to VTT: {}".format( subtitle_vtt)) node["subtitle_url"] = os.path.join( "subtitle", node['id'] + '.vtt') else: if options['download']: logger.error("File not found or downloaded: {}".format( video_file_src)) else: logger.error("Invalid node, kind: {}".format( node.get("kind", None))) # Exercises cannot be displayed node["content"] = {"available": False} new_children = [] for child in node.get('children', []): copy_media(child) empty_topic = child["kind"] == "Topic" and not child.get( "children", []) unavailable_video = child["kind"] == "Video" and not child.get( "content", {}).get("available", False) if not (empty_topic or unavailable_video): new_children.append(child) node['children'] = new_children