def make_language_pack(lang, version, sublangargs, filename, no_assessment_items, no_subtitles, no_assessment_resources):
    """Build a complete KA Lite language pack for *lang* and bundle it into *filename*.

    Pulls node/subtitle/catalog resources, applies dubbed-video and subtitle
    mappings, translates content (for non-English packs), and writes the
    final zip via bundle_language_pack.
    """
    nodes, subtitle_map, interface_catalog, content_catalog = retrieve_language_resources(
        version, sublangargs, no_subtitles)
    subtitle_ids = subtitle_map.keys()
    subtitle_file_paths = subtitle_map.values()

    nodes = list(translate_nodes(nodes, content_catalog))
    nodes, dubbed_count = apply_dubbed_video_map(nodes, subtitle_ids, sublangargs["video_lang"])

    html_ids, assessment_ids, nodes = separate_exercise_types(nodes)
    html_exercise_path, translated_html_ids = retrieve_html_exercises(html_ids, lang)

    # now include only the assessment item resources that we need
    all_items, all_item_files = retrieve_all_assessment_item_data(
        no_item_data=no_assessment_items,
        no_item_resources=no_assessment_resources,
    )

    if lang != "en":
        # Non-English packs only ship content that actually got translated.
        items = list(translate_assessment_item_text(all_items, content_catalog))
        nodes = remove_untranslated_exercises(nodes, translated_html_ids, items)
    else:
        items = all_items

    metadata = generate_kalite_language_pack_metadata(
        lang, version, interface_catalog, content_catalog, subtitle_ids, dubbed_count)
    # NOTE(review): interface_catalog is intentionally passed twice (frontend and
    # backend catalog slots) — same as the other pipeline variants in this file.
    bundle_language_pack(str(filename), nodes, interface_catalog, interface_catalog,
                         metadata, items, all_item_files, subtitle_file_paths,
                         html_exercise_path)
def retrieve_assessment_item_data(assessment_item, lang=None, force=False, no_item_data=False, no_item_resources=False, content_catalog=None) -> (dict, [str]):
    """
    Retrieve assessment item data and images for a single assessment item.

    Downloads the item JSON from the Khan Academy API (language-specific URL
    when *lang* is given), optionally translates its text via
    *content_catalog*, downloads the images/graphies it references, and
    localizes all embedded URLs/links.

    :param assessment_item: id of assessment item
    :param lang: language to retrieve data in
    :param force: refetch assessment item and images even if it exists on disk
    :param no_item_data: if True, skip everything and return ({}, [])
    :param no_item_resources: if True, skip downloading image/graphie files
    :param content_catalog: translation catalog; when given and lang != "en",
        item text is translated before URL localization
    :return: tuple of dict of assessment item data and list of paths to files;
        ({}, []) when the item is skipped, untranslated, or has empty content
    """
    if no_item_data:
        return {}, []
    # Pick a language-qualified URL/cache filename when a language is requested,
    # otherwise fall back to the default (English) endpoint.
    if lang:
        url = "http://www.khanacademy.org/api/v1/assessment_items/{assessment_item}?lang={lang}".format(lang=lang, assessment_item=assessment_item)
        filename = "assessment_items/{assessment_item}_{lang}.json".format(lang=lang, assessment_item=assessment_item)
    else:
        url = "http://www.khanacademy.org/api/v1/assessment_items/{assessment_item}"
        filename = "assessment_items/{assessment_item}.json"
    try:
        url = url.format(assessment_item=assessment_item)
        filename = filename.format(assessment_item=assessment_item)
        path = download_assessment_item_data(url, filename=filename, lang=lang, force=force)
    except requests.RequestException:
        # Log which item failed, then re-raise so the caller can decide how to
        # handle the failed download.
        logging.error("Download failure for assessment item: {assessment_item}".format(assessment_item=assessment_item))
        raise
    with open(path, "r") as f:
        item_data = json.load(f)
    # TEMP HACK: translate the item text here before URLs are localized, because otherwise, later, Crowdin strings no longer match
    if lang != "en" and content_catalog is not None:
        item_data = list(translate_assessment_item_text([item_data], content_catalog))
        if item_data:
            item_data = item_data[0]
        else:
            # if no translation, return empty assessment_item
            return {}, []
    # Collect every image and graphie URL referenced by the item so the
    # resources can be cached locally.
    image_urls = find_all_image_urls(item_data)
    graphie_urls = find_all_graphie_urls(item_data)
    urls = list(itertools.chain(image_urls, graphie_urls))
    def _download_image_urls(url):
        # Some URLs have hand-maintained filename overrides; default to the
        # URL's basename otherwise.
        filename = MANUAL_IMAGE_URL_TO_FILENAME_MAPPING.get(url, os.path.basename(url))
        filepath = _get_subpath_from_filename(filename)
        return download_and_cache_file(url, filename=filepath)
    file_paths = [] if no_item_resources else list(map(_download_image_urls, urls))
    # Rewrite remote URLs/links inside the item to their local equivalents.
    item_data = localize_image_urls(item_data)
    item_data = localize_content_links(item_data)
    item_data = localize_graphie_urls(item_data)
    # Validate assessment item content.
    for k, v in ujson.loads(item_data["item_data"]).items():
        if k == "question":
            if not v.get("content"):
                # The API sometimes returns questions with no content; treat
                # such items as missing rather than shipping empty exercises.
                logging.info("Found empty assessment content from KA's API {assessment_item}".format(assessment_item=assessment_item))
                return {}, []
    return item_data, file_paths
def test_doesnt_returns_all_items(self):
    """Every item is returned by translate_assessment_item_text, translated or not."""
    catalog = generate_catalog()
    items = [
        {"id": "not_in_catalog", "item_data": '"wala ito sa catalog"'},
        {"id": "not_translated", "item_data": '"Heart failure"'},
        {"id": "translated", "item_data": '"Millions"'},
    ]
    returned_ids = [item.get("id") for item in translate_assessment_item_text(items, catalog)]
    assert "translated" in returned_ids
    assert "not_in_catalog" in returned_ids
    assert "not_translated" in returned_ids
def make_language_pack(lang, version, sublangargs, filename, ka_domain, no_assessment_items, no_subtitles, no_assessment_resources, no_dubbed_videos):
    """Build a KA Lite language pack for *lang* and bundle it into *filename*.

    Variant that fetches against a configurable *ka_domain*, scopes assessment
    downloads to the node data, and prunes empty/nonexistent assessment items
    before bundling.
    """
    nodes, subtitle_map, interface_catalog, content_catalog = retrieve_language_resources(
        version, sublangargs, ka_domain, no_subtitles, no_dubbed_videos)
    subtitle_ids = subtitle_map.keys()
    subtitle_file_paths = subtitle_map.values()

    nodes = list(translate_nodes(nodes, content_catalog))
    nodes, dubbed_count = apply_dubbed_video_map(nodes, subtitle_ids, sublangargs["video_lang"])

    html_ids, assessment_ids, nodes = separate_exercise_types(nodes)
    html_exercise_path, translated_html_ids = retrieve_html_exercises(html_ids, lang)

    # now include only the assessment item resources that we need
    all_items, all_item_files = retrieve_all_assessment_item_data(
        no_item_data=no_assessment_items,
        no_item_resources=no_assessment_resources,
        node_data=nodes,
        lang=lang,
    )

    # Drop broken items, then scrub node data of references to items we no
    # longer have.
    all_items = list(remove_assessment_data_with_empty_widgets(all_items))
    nodes = remove_nonexistent_assessment_items_from_exercises(nodes, all_items)
    nodes = clean_node_data_items(nodes)

    if lang != "en":
        # Non-English packs only ship content that actually got translated.
        items = list(translate_assessment_item_text(all_items, content_catalog))
        nodes = remove_untranslated_exercises(nodes, translated_html_ids, items)
    else:
        items = all_items

    metadata = generate_kalite_language_pack_metadata(
        lang, version, interface_catalog, content_catalog, subtitle_ids, dubbed_count)
    # NOTE(review): interface_catalog is intentionally passed twice (frontend and
    # backend catalog slots) — same as the other pipeline variants in this file.
    bundle_language_pack(str(filename), nodes, interface_catalog, interface_catalog,
                         metadata, items, all_item_files, subtitle_file_paths,
                         html_exercise_path)
def make_language_pack(lang, version, sublangargs, filename, ka_domain, no_assessment_items, no_subtitles, no_assessment_resources, no_dubbed_videos):
    """Build language-pack data for *lang* and pickle the results to disk.

    Variant that, instead of bundling a pack archive, sorts the node data and
    writes node_data_<lang>.pickle and assessment_data_<lang>.pickle.
    """
    nodes, subtitle_map, content_catalog = retrieve_language_resources(
        version, sublangargs, ka_domain, no_subtitles, no_dubbed_videos)
    subtitle_ids = subtitle_map.keys()
    subtitle_file_paths = subtitle_map.values()

    nodes = list(translate_nodes(nodes, content_catalog))
    nodes, dubbed_count = apply_dubbed_video_map(nodes, subtitle_ids, sublangargs["video_lang"])

    html_ids, assessment_ids, nodes = separate_exercise_types(nodes)
    html_exercise_path, translated_html_ids = retrieve_html_exercises(html_ids, lang)

    # now include only the assessment item resources that we need
    all_items, all_item_files = retrieve_all_assessment_item_data(
        no_item_data=no_assessment_items,
        no_item_resources=no_assessment_resources,
        node_data=nodes,
        lang=lang,
    )

    # Drop broken items, then scrub node data of references to items we no
    # longer have.
    all_items = list(remove_assessment_data_with_empty_widgets(all_items))
    nodes = remove_nonexistent_assessment_items_from_exercises(nodes, all_items)
    nodes = clean_node_data_items(nodes)

    if lang != "en":
        # Non-English packs only ship content that actually got translated.
        items = list(translate_assessment_item_text(all_items, content_catalog))
        nodes = remove_untranslated_exercises(nodes, translated_html_ids, items)
    else:
        items = all_items

    # Emit deterministic, sort_order-sorted pickles instead of a bundled pack.
    nodes = sorted(list(nodes), key=lambda node: node.get('sort_order'))
    with open('node_data_{0}.pickle'.format(lang), 'wb') as handle:
        pickle.dump(nodes, handle)
    with open('assessment_data_{0}.pickle'.format(lang), 'wb') as handle:
        pickle.dump(items, handle)
def test_doesnt_return_untranslated_items(self):
    """Only items with a catalog translation come back from translate_assessment_item_text."""
    catalog = generate_catalog()
    items = {
        "not_in_catalog": {"item_data": '"wala ito sa catalog"'},
        "not_translated": {"item_data": '"Heart failure"'},
        "translated": {"item_data": '"Millions"'},
    }
    returned_ids = [item_id for item_id, _ in translate_assessment_item_text(items, catalog)]
    assert "translated" in returned_ids
    assert "not_in_catalog" not in returned_ids
    assert "not_translated" not in returned_ids
def test_doesnt_returns_all_items(self):
    """translate_assessment_item_text yields every input item, translated or not."""
    catalog = generate_catalog()
    sample_data = [
        {"id": item_id, "item_data": item_text}
        for item_id, item_text in (
            ("not_in_catalog", '"wala ito sa catalog"'),
            ("not_translated", '"Heart failure"'),
            ("translated", '"Millions"'),
        )
    ]
    result_ids = {node.get("id") for node in translate_assessment_item_text(sample_data, catalog)}
    assert {"translated", "not_in_catalog", "not_translated"} <= result_ids