def get_khan_topic_tree(lang="en", curr_key=None): if lang == "sw": response = make_request(V2_API_URL.format(lang="swa", projection=PROJECTION_KEYS), timeout=120) else: response = make_request(V2_API_URL.format(lang=lang, projection=PROJECTION_KEYS), timeout=120) topic_tree = ujson.loads(response.content) # if name of lang is passed in, get language code if getlang_by_name(lang): lang = getlang_by_name(lang).primary_code if lang not in SUPPORTED_LANGS: global translations translations = retrieve_translations(lang_code=lang) # Flatten node_data flattened_tree = [ node for node_list in topic_tree.values() for node in node_list ] # convert to dict with ids as keys tree_dict = {node["id"]: node for node in flattened_tree} return _recurse_create(tree_dict["x00000000"], tree_dict, lang=lang)
def generate_dubbed_video_mappings_from_csv():
    resp = make_request(KA_LITE_DUBBED_LIST, timeout=120)
    csv_data = resp.content.decode("utf-8")
    # This CSV file is in standard format: separated by ",", quoted by '"'
    reader = csv.reader(StringIO(csv_data))

    video_map = {}
    header_row = []

    # Loop through each row in the spreadsheet.
    for row in reader:
        # skip over the header rows
        if row[0].strip() in ["", "UPDATED:"]:
            continue

        elif row[0] == "SERIAL":
            # Read the header row.
            header_row = [
                v.lower() for v in row
            ]  # lcase all header row values (including language names)
            slug_idx = header_row.index("title id")
            english_idx = header_row.index("english")
            assert slug_idx != -1, "Video slug column header should be found."
            assert english_idx != -1, "English video column header should be found."

        else:
            # Rows 6 and beyond are data.
            assert len(row) == len(
                header_row
            ), "Values line length equals headers line length"

            # Grab the slug and english video ID.
            video_slug = row[slug_idx]
            english_video_id = row[english_idx]
            assert english_video_id, "English Video ID should not be empty"
            assert video_slug, "Slug should not be empty"

            # English video is the first video ID column,
            # and following columns (until the end) are other languages.
            # Loop through those columns and, if a video exists,
            # add it to the dictionary.
            for idx in range(english_idx, len(row)):
                if not row[idx]:  # make sure there's a dubbed video
                    continue

                lang = header_row[idx]
                if lang not in video_map:  # add the first level if it doesn't exist
                    video_map[lang] = {}
                dubbed_youtube_id = row[idx]
                if english_video_id == dubbed_youtube_id and lang != "english":
                    print(
                        "Removing entry for (%s, %s): dubbed and english youtube ID are the same."
                        % (lang, english_video_id)
                    )
                else:
                    # add the corresponding video id for the video, in this language.
                    video_map[lang][english_video_id] = row[idx]

    return video_map
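
# Example (not part of the original source): a sketch of how the dubbed-video
# map might be queried. Keys at the first level are the lower-cased language
# names from the spreadsheet header (e.g. "spanish"), and keys at the second
# level are English YouTube IDs.
def _example_lookup_dubbed_video(video_map, lang_name, english_youtube_id):
    # Returns the dubbed YouTube ID, or None if no dub exists for this language.
    return video_map.get(lang_name, {}).get(english_youtube_id)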
def get_assessment_items(self):
    items_list = []
    kalang = ASSESSMENT_LANGUAGE_MAPPING.get(self.lang, self.lang)
    for ai_id in self.assessment_items:
        item_url = ASSESSMENT_URL.format(assessment_item=ai_id, kalang=kalang)
        item = make_request(item_url).json()
        # check if assessment item is fully translated, before adding it to list
        if item["isFullyTranslated"]:
            ai = KhanAssessmentItem(item["id"], item["itemData"], self.source_url)
            items_list.append(ai)
    return items_list
def get_video_id_english_mappings():
    projection = json.dumps({"videos": [OrderedDict([("youtubeId", 1), ("id", 1)])]})
    r = make_request(V2_API_URL.format(lang='en', projection=projection), timeout=120)
    english_video_data = r.json()
    english_video_data = english_video_data["videos"]
    mapping = {n["id"]: n["youtubeId"] for n in english_video_data}
    return mapping
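
# Example (not part of the original source): a sketch that chains the two maps
# above to find the dubbed YouTube ID for a video, given its readable id and a
# language name. Both helpers hit the network, so in practice the maps would be
# built once and reused.
def _example_dubbed_id_for_readable_id(readable_id, lang_name):
    english_ids = get_video_id_english_mappings()
    dubbed_map = generate_dubbed_video_mappings_from_csv()
    english_youtube_id = english_ids.get(readable_id)
    if english_youtube_id is None:
        return None
    return dubbed_map.get(lang_name, {}).get(english_youtube_id)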
def get_assessment_items(self):
    items_list = []
    lang = ASSESSMENT_LANGUAGE_MAPPING.get(self.lang, self.lang)
    for i in self.assessment_items:
        item_url = ASSESSMENT_URL.format(assessment_item=i["id"], lang=lang)
        item = make_request(item_url).json()
        # check if assessment item is fully translated, before adding it to list
        if item["is_fully_translated"]:
            items_list.append(
                KhanAssessmentItem(item["id"], item["item_data"], self.source_url)
            )
    return items_list
def retrieve_translations(lang_code, includes="*.po"): if lang_code in SUPPORTED_LANGS: return {} lang_code = CROWDIN_LANGUAGE_MAPPING.get(lang_code, lang_code) r = make_request(CROWDIN_URL.format(key=os.environ['KA_CROWDIN_SECRET_KEY'], lang_code=lang_code), timeout=180) with open('crowdin.zip', "wb") as f: for chunk in r.iter_content(1024): f.write(chunk) zip_extraction_path = tempfile.mkdtemp() with zipfile.ZipFile('crowdin.zip') as zf: zf.extractall(zip_extraction_path) all_filenames = glob.iglob( os.path.join(zip_extraction_path, "**"), recursive=True ) filenames = fnmatch.filter(all_filenames, includes) # use the polib library, since it's much faster at concatenating # po files. it doesn't have a dict interface though, so we'll # reread the file using babel.Catalog. with tempfile.NamedTemporaryFile() as f: main_pofile = polib.POFile(fpath=f.name) for filename in filenames: pofile = polib.pofile(filename) main_pofile.merge(pofile) for entry in main_pofile: entry.obsolete = False main_pofile.save() shutil.rmtree(zip_extraction_path) msgid_mapping = Catalog(main_pofile) return msgid_mapping
def get_khan_api_json(lang, update=False):
    """
    Get all data for language `lang` from the KA API at /api/v2/topics/topictree
    """
    filename = 'khan_academy_json_{}.json'.format(lang)
    filepath = os.path.join(KHAN_API_CACHE_DIR, filename)
    if os.path.exists(filepath) and not update:
        print('Loaded KA API json from cache', filepath)
        data = json.load(open(filepath))
    else:
        print('Downloading KA API json for lang =', lang)
        url = V2_API_URL.format(lang=lang, projection=PROJECTION_KEYS)
        LOGGER.debug('khan API url=' + url)
        response = make_request(url, timeout=120)
        data = response.json()
        if not os.path.exists(KHAN_API_CACHE_DIR):
            os.makedirs(KHAN_API_CACHE_DIR, exist_ok=True)
        json.dump(data, open(filepath, 'w'), ensure_ascii=False, indent=4)
    return data
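
# Example (not part of the original source): a sketch of forcing a re-download
# of the cached API data and inspecting it. The top-level keys of the topictree
# JSON are node-kind lists (e.g. "videos"), mirroring how get_khan_topic_tree
# flattens topic_tree.values() above.
def _example_refresh_khan_api_json(lang="fr"):
    data = get_khan_api_json(lang, update=True)
    for kind, nodes in data.items():
        print(kind, len(nodes))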
def generate_common_core_mapping():
    resp = make_request(COMMON_CORE_SPREADSHEET, timeout=120)
    csv_data = resp.content.decode("utf-8")
    # This CSV file is in standard format: separated by ",", quoted by '"'
    reader = csv.reader(StringIO(csv_data))

    slug_standard_map = {}
    header_row = []

    # Loop through each row in the spreadsheet.
    for row in reader:
        if row[0] == "Grade":
            # Read the header row.
            header_row = [v.lower() for v in row]  # lcase all header row values
            grade_idx = header_row.index("grade")
            common_core_idx = header_row.index("common core area")
            standard_idx = header_row.index("standard")
            skill_name_idx = header_row.index("name of skill on khan academy")
            link_idx = header_row.index("link to skill")
            description_idx = header_row.index("description")
            area_idx = header_row.index("area")
        else:
            # Grab CC standard and link to exercise
            standard_tag = row[standard_idx]
            link = row[link_idx]
            if not link or not standard_tag:
                continue
            # parse out slug from link and set standard tag
            slug = link.split("e/")[1]
            slug_standard_map[slug] = standard_tag

    return slug_standard_map
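
# Example (not part of the original source): a sketch of looking up the Common
# Core standard for an exercise. The map is keyed by the exercise slug parsed
# out of the spreadsheet's "link to skill" column (everything after "e/" in the
# exercise URL), so the same parsing is applied to the URL being looked up.
def _example_standard_for_exercise(slug_standard_map, exercise_url):
    slug = exercise_url.split("e/")[1]
    return slug_standard_map.get(slug)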