Example #1
def get_exercise_page_paths(video_id=None, video_slug=None):
    assert (video_id or video_slug) and not (video_id and video_slug), "One arg, not two" 

    try:
        # Resolve a video_id to its slug before hitting the node cache
        if not video_slug:
            video_slug = topic_tools.get_id2slug_map()[video_id]

        exercise_paths = set()
        for exercise in get_related_exercises(video=topic_tools.get_node_cache("Video")[video_slug]):
            exercise_paths = exercise_paths.union(set(exercise["paths"]))
        return list(exercise_paths)
    except:
        return []
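
Both path-lookup helpers above guard their call signature with the same assertion: exactly one of video_id and video_slug must be supplied. Below is a minimal, self-contained sketch of that pattern outside KA Lite; the resolve_video function and its sample values are hypothetical and exist only to illustrate the check.

def resolve_video(video_id=None, video_slug=None):
    # Exactly one of the two identifiers may be given: the assert fails both
    # when neither is supplied and when both are supplied.
    assert (video_id or video_slug) and not (video_id and video_slug), "One arg, not two"
    return video_slug or video_id

resolve_video(video_slug="intro-to-fractions")    # passes
resolve_video(video_id="abc123")                  # passes
# resolve_video()                                 # AssertionError
# resolve_video("abc123", "intro-to-fractions")   # AssertionError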
Example #2
def get_video_page_paths(video_id=None, video_slug=None):
    assert (video_id or video_slug) and not (video_id and video_slug), "One arg, not two" 

    try:
        if not video_slug:
            video_slug = topic_tools.get_id2slug_map()[video_id]
        
        return topic_tools.get_node_cache("Video")[video_slug]['paths']
    except:
        return []
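
Example #2 resolves a video_id to its slug through the id-to-slug map before reading the paths list out of the node cache. A rough sketch of that two-step lookup with stubbed-in dictionaries follows; the sample data is invented for illustration and is not taken from KA Lite.

# Hypothetical stand-ins for topic_tools.get_id2slug_map() and get_node_cache("Video")
id2slug_map = {"abc123": "intro-to-fractions"}
video_cache = {"intro-to-fractions": {"paths": ["/math/arithmetic/intro-to-fractions/"]}}

def video_paths(video_id=None, video_slug=None):
    if not video_slug:
        video_slug = id2slug_map[video_id]      # first hop: id -> slug
    return video_cache[video_slug]["paths"]     # second hop: slug -> paths

print(video_paths(video_id="abc123"))   # ['/math/arithmetic/intro-to-fractions/']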
Example #3
def create_youtube_id_to_slug_map(node_cache=None, data_path=settings.PROJECT_PATH + "/static/data/"):
    """
    Go through all videos, and make a map of youtube_id to slug, for fast look-up later
    """

    if not node_cache:
        node_cache = topic_tools.get_node_cache(force=True)

    map_file = os.path.join(data_path, topic_tools.video_remap_file)
    id2slug_map = dict()

    # Make a map from youtube ID to video slug
    for v in node_cache['Video'].values():
        assert v["youtube_id"] not in id2slug_map, "Make sure there's a 1-to-1 mapping between youtube_id and slug"
        id2slug_map[v['youtube_id']] = v['slug']

    # Save the map!
    with open(map_file, "w") as fp:
        fp.write(json.dumps(id2slug_map, indent=2))
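
The assert inside the loop is what enforces the 1-to-1 guarantee: if two videos ever shared a youtube_id, the second occurrence would trip it before the file is written. A small standalone sketch of that behaviour, using hypothetical sample records:

videos = [  # hypothetical sample records
    {"youtube_id": "abc123", "slug": "intro-to-fractions"},
    {"youtube_id": "def456", "slug": "adding-fractions"},
    # {"youtube_id": "abc123", "slug": "duplicate"},   # would trip the assert below
]

id2slug_map = {}
for v in videos:
    # Refuse to overwrite an existing entry: youtube_id -> slug must stay 1-to-1
    assert v["youtube_id"] not in id2slug_map, "duplicate youtube_id: %s" % v["youtube_id"]
    id2slug_map[v["youtube_id"]] = v["slug"]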
Example #4
def create_youtube_id_to_slug_map(node_cache=None,
                                  data_path=settings.PROJECT_PATH +
                                  "/static/data/"):
    """
    Go through all videos, and make a map of youtube_id to slug, for fast look-up later
    """

    if not node_cache:
        node_cache = topic_tools.get_node_cache(force=True)

    map_file = os.path.join(data_path, topic_tools.video_remap_file)
    id2slug_map = dict()

    # Make a map from youtube ID to video slug
    for v in node_cache['Video'].values():
        assert v["youtube_id"] not in id2slug_map, \
            "Make sure there's a 1-to-1 mapping between youtube_id and slug"
        id2slug_map[v['youtube_id']] = v['slug']

    # Save the map!
    with open(map_file, "w") as fp:
        fp.write(json.dumps(id2slug_map, indent=2))
Example #5
def create_all_mappings(force=False, frequency_to_save=100, response_to_check=None, date_to_check=None):
    """Write or update JSON file that maps from YouTube ID to Amara code and languages available"""
    videos = get_node_cache("Video")

    # Initialize the data
    out_file = settings.SUBTITLES_DATA_ROOT + SRTS_JSON_FILENAME

    if not os.path.exists(out_file):
        srts_dict = {}
    else:
        # Open the file, read, and clean out old videos.
        try:
            with open(out_file, "r") as fp:
                srts_dict = json.load(fp)
        except Exception as e:
            logging.debug("JSON file corrupted, using empty json and starting from scratch.\n%s" % e)
            srts_dict = {}
        else:
            logging.info("Loaded %d mappings." % (len(srts_dict)))

        # Set of videos no longer used by KA Lite
        removed_videos = set(srts_dict.keys()) - set([v["youtube_id"] for v in videos.values()])
        if removed_videos:
            logging.info("Removing subtitle information for %d videos (no longer used)." % len(removed_videos))
            for vid in removed_videos:
                del srts_dict[vid]
    logging.info("Querying %d mappings." % (len(videos) - (0 if (force or date_to_check) else len(srts_dict))))

    # Once we have the current mapping, proceed through logic to update the mapping
    n_new_entries = 0
    n_failures = 0
    for video, data in videos.iteritems():
        # Decide whether or not to update this video based on the arguments provided at the command line
        youtube_id = data["youtube_id"]
        cached = youtube_id in srts_dict
        if not force and cached:
            # First, check against date
            flag_for_refresh = True  # not (response_code or last_attempt)
            last_attempt = srts_dict[youtube_id].get("last_attempt")
            last_attempt = None if not last_attempt else datetime.datetime.strptime(last_attempt, "%Y-%m-%d")
            flag_for_refresh = flag_for_refresh and (not date_to_check or date_to_check > last_attempt)
            if not flag_for_refresh:
                logging.debug("Skipping %s for date-check" % youtube_id)
                continue
            # Second, check against response code
            response_code = srts_dict[youtube_id].get("api_response")
            flag_for_refresh = flag_for_refresh and (
                not response_to_check or response_to_check == "all" or response_to_check == response_code
            )
            if not (flag_for_refresh):
                logging.debug("Skipping %s for response-code" % youtube_id)
                continue
            if not response_to_check and not date_to_check and cached:  # no flags specified and already cached - skip
                logging.debug("Skipping %s for already-cached and no flags specified" % youtube_id)
                continue
        else:
            if force and not cached:
                logging.debug("Updating %s because force flag (-f) given and video not cached." % youtube_id)
            elif force and cached:
                logging.debug("Updating %s because force flag (-f) given. Video was previously cached." % youtube_id)
            else:
                logging.debug("Updating %s because video not yet cached." % youtube_id)

        # If it makes it to here without hitting a continue, then update the entry
        try:
            srts_dict[youtube_id] = update_video_entry(youtube_id, entry=srts_dict.get(youtube_id, {}))
        except Exception as e:
            logging.warn("Error updating video %s: %s" % (youtube_id, e))
            n_failures += 1
            continue

        if n_new_entries % frequency_to_save == 0:
            logging.info("On loop %d dumping dictionary into %s" % (n_new_entries, out_file))
            with open(out_file, "wb") as fp:
                json.dump(srts_dict, fp)
        n_new_entries += 1

    # Finished the loop: save and report
    with open(out_file, "wb") as fp:
        json.dump(srts_dict, fp)
    if n_failures == 0:
        logging.info("Great success! Stored %d fresh entries, %d total." % (n_new_entries, len(srts_dict)))
    else:
        logging.warn("Stored %s fresh entries, but with %s failures." % (n_new_entries, n_failures))
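
One detail worth noting in the loop above is the periodic checkpoint: the mapping dictionary is dumped to disk every frequency_to_save updates, so a crash part-way through a long run loses at most one batch of work. A stripped-down sketch of that pattern follows; process_item and process_all are placeholders invented for illustration, not KA Lite functions.

import json

def process_item(key):
    # Placeholder for the real per-item work (update_video_entry in the example above)
    return {"status": "ok"}

def process_all(keys, out_file, frequency_to_save=100):
    results = {}
    for n_done, key in enumerate(keys, start=1):
        results[key] = process_item(key)
        if n_done % frequency_to_save == 0:
            # Checkpoint: persist partial results so a crash loses at most one batch
            with open(out_file, "w") as fp:
                json.dump(results, fp)
    # Final save once the loop completes
    with open(out_file, "w") as fp:
        json.dump(results, fp)
    return results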
Example #6
File: topicdata.py Project: mjptak/ka-lite
import json
import os

import settings
from utils import topic_tools


TOPICS          = topic_tools.get_topic_tree()
NODE_CACHE      = topic_tools.get_node_cache()
ID2SLUG_MAP     = topic_tools.get_id2slug_map()
Example #7
import json
import os

import settings
from utils import topic_tools

TOPICS = topic_tools.get_topic_tree()
NODE_CACHE = topic_tools.get_node_cache()
EXERCISE_TOPICS = topic_tools.get_exercise_topics()
ID2SLUG_MAP = topic_tools.get_id2slug_map()
LANGUAGE_LOOKUP = json.loads(
    open(os.path.join(settings.DATA_PATH, "languages.json")).read())
LANGUAGE_LIST = json.loads(
    open(os.path.join(settings.SUBTITLES_DATA_ROOT,
                      "listedlanguages.json")).read())
Example #8
def create_all_mappings(force=False, frequency_to_save=100, response_to_check=None, date_to_check=None):
    """Write or update JSON file that maps from YouTube ID to Amara code and languages available"""
    videos = get_node_cache('Video')

    # Initialize the data
    out_file = settings.SUBTITLES_DATA_ROOT + SRTS_JSON_FILENAME

    if not os.path.exists(out_file):
        srts_dict = {}
    else:
        # Open the file, read, and clean out old videos.
        try:
            with open(out_file, "r") as fp:
                srts_dict = json.load(fp)
        except Exception as e:
            logging.debug("JSON file corrupted, using empty json and starting from scratch.\n%s" % e)
            srts_dict = {}
        else:
            logging.info("Loaded %d mappings." % (len(srts_dict)))

        # Set of videos no longer used by KA Lite
        removed_videos = set(srts_dict.keys()) - set([v["youtube_id"] for v in videos.values()])
        if removed_videos:
            logging.info("Removing subtitle information for %d videos (no longer used)." % len(removed_videos))
            for vid in removed_videos:
                del srts_dict[vid]
    logging.info("Querying %d mappings." % (len(videos) - (0 if (force or date_to_check) else len(srts_dict))))

    # Once we have the current mapping, proceed through logic to update the mapping
    n_new_entries = 0
    n_failures = 0
    for video, data in videos.iteritems():
        # Decide whether or not to update this video based on the arguments provided at the command line 
        youtube_id = data['youtube_id']
        cached = youtube_id in srts_dict
        if not force and cached: 
            # First, check against date
            flag_for_refresh = True # not (response_code or last_attempt)
            last_attempt = srts_dict[youtube_id].get("last_attempt")
            last_attempt = None if not last_attempt else datetime.datetime.strptime(last_attempt, '%Y-%m-%d')
            flag_for_refresh = flag_for_refresh and (not date_to_check or date_to_check > last_attempt)
            if not flag_for_refresh:
                logging.debug("Skipping %s for date-check" % youtube_id)
                continue
            # Second, check against response code
            response_code = srts_dict[youtube_id].get("api_response")
            flag_for_refresh = flag_for_refresh and (not response_to_check or response_to_check == "all" or response_to_check == response_code)
            if not (flag_for_refresh):
                logging.debug("Skipping %s for response-code" % youtube_id)
                continue
            if not response_to_check and not date_to_check and cached: # no flags specified and already cached - skip
                logging.debug("Skipping %s for already-cached and no flags specified" % youtube_id)
                continue
        else:
            if force and not cached:
                logging.debug("Updating %s because force flag (-f) given and video not cached." % youtube_id)
            elif force and cached:
                logging.debug("Updating %s because force flag (-f) given. Video was previously cached." % youtube_id)
            else: 
                logging.debug("Updating %s because video not yet cached." % youtube_id)

        # If it makes it to here without hitting a continue, then update the entry
        try:
            srts_dict[youtube_id] = update_video_entry(youtube_id, entry=srts_dict.get(youtube_id, {}))
        except Exception as e:
            logging.warn("Error updating video %s: %s" % (youtube_id, e))
            n_failures += 1
            continue

        if n_new_entries % frequency_to_save == 0:
            logging.info("On loop %d dumping dictionary into %s" %(n_new_entries, out_file))
            with open(out_file, 'wb') as fp:
                json.dump(srts_dict, fp)
        n_new_entries += 1

    # Finished the loop: save and report
    with open(out_file, 'wb') as fp:
        json.dump(srts_dict, fp)
    if n_failures == 0:
        logging.info("Great success! Stored %d fresh entries, %d total." % (n_new_entries, len(srts_dict)))
    else:
        logging.warn("Stored %s fresh entries, but with %s failures." % (n_new_entries, n_failures))
Example #9
def rebuild_topictree(data_path=settings.PROJECT_PATH + "/static/data/", remove_unknown_exercises=False):
    """
    Downloads topictree (and supporting) data from Khan Academy and uses it to
    rebuild the KA Lite topictree cache (topics.json).
    """

    topictree = download_khan_data("http://www.khanacademy.org/api/v1/topictree")

    related_exercise = {}  # Temp variable to save exercises related to particular videos

    def recurse_nodes(node, path=""):
        """
        Internal function for recursing over the topic tree, marking relevant metadata,
        and removing undesired attributes and children.
        """
        
        kind = node["kind"]

        # Only keep key data we can use
        for key in node.keys():
            if key not in attribute_whitelists[kind]:
                del node[key]

        # Fix up data
        if slug_key[kind] not in node:
            logging.warn("Could not find expected slug key (%s) on node: %s" % (slug_key[kind], node))
            node[slug_key[kind]] = node["id"]  # put it SOMEWHERE.
        node["slug"] = node[slug_key[kind]] if node[slug_key[kind]] != "root" else ""
        node["id"] = node["slug"]  # these used to be the same; now not. Easier if they stay the same (issue #233)

        node["path"] = path + topic_tools.kind_slugs[kind] + node["slug"] + "/"
        node["title"] = node[title_key[kind]]


        kinds = set([kind])

        # For each exercise, need to get related videos
        if kind == "Exercise":
            related_video_readable_ids = [vid["readable_id"] for vid in download_khan_data("http://www.khanacademy.org/api/v1/exercises/%s/videos" % node["name"], node["name"] + ".json")]
            node["related_video_readable_ids"] = related_video_readable_ids
            exercise = {
                "slug": node[slug_key[kind]],
                "title": node[title_key[kind]],
                "path": node["path"],
            }
            for video_id in node.get("related_video_readable_ids", []):
                related_exercise[video_id] = exercise

        # Recurse through children, remove any blacklisted items
        children_to_delete = []
        for i, child in enumerate(node.get("children", [])):
            child_kind = child.get("kind", None)
            if child_kind in kind_blacklist:
                children_to_delete.append(i)
                continue
            if child[slug_key[child_kind]] in slug_blacklist:
                children_to_delete.append(i)
                continue
            kinds = kinds.union(recurse_nodes(child, node["path"]))
        for i in reversed(children_to_delete):
            del node["children"][i]

        # Mark on topics whether they contain Videos, Exercises, or both
        if kind == "Topic":
            node["contains"] = list(kinds)

        return kinds
    recurse_nodes(topictree)


    # Limit exercises to only the previous list
    def recurse_nodes_to_delete_exercise(node, OLD_NODE_CACHE):
        """
        Internal function for recursing the topic tree and removing new exercises.
        Requires rebranding of metadata done by recurse_nodes function.
        """
        # Stop recursing when we hit leaves
        if node["kind"] != "Topic":
            return

        children_to_delete = []
        for ci, child in enumerate(node.get("children", [])):
            # Mark all unrecognized exercises for deletion
            if child["kind"] == "Exercise":
                if not child["slug"] in OLD_NODE_CACHE["Exercise"].keys():
                    children_to_delete.append(ci)
            # Recurse over children to delete
            elif child.get("children", None):
                recurse_nodes_to_delete_exercise(child, OLD_NODE_CACHE)
                # Delete children without children (all their children were removed)
                if not child.get("children", None):
                    logging.debug("Removing now-childless topic node '%s'" % child["slug"])
                    children_to_delete.append(ci)
                # If there are no longer exercises, be honest about it
                elif not any([ch["kind"] == "Exercise" or "Exercise" in ch.get("contains", []) for ch in child["children"]]):
                    child["contains"] = list(set(child["contains"]) - set(["Exercise"]))

        # Do the actual deletion
        for i in reversed(children_to_delete):
            logging.debug("Deleting unknown exercise %s" % node["children"][i]["slug"])
            del node["children"][i]
    if remove_unknown_exercises:
        OLD_NODE_CACHE = topic_tools.get_node_cache()
        recurse_nodes_to_delete_exercise(topictree, OLD_NODE_CACHE) # do this before [add related]
        for vid, ex in related_exercise.items():
            if ex and ex["slug"] not in OLD_NODE_CACHE["Exercise"].keys():
                related_exercise[vid] = None

    def recurse_nodes_to_add_related_exercise(node):
        """
        Internal function for recursing the topic tree and marking related exercises.
        Requires rebranding of metadata done by recurse_nodes function.
        """
        if node["kind"] == "Video":
            node["related_exercise"] = related_exercise.get(node["slug"], None)
        for child in node.get("children", []):
            recurse_nodes_to_add_related_exercise(child)
    recurse_nodes_to_add_related_exercise(topictree)

    def recurse_nodes_to_remove_childless_nodes(node):
        """
        When we remove exercises, we remove dead-end topics.
        Khan just sends us dead-end topics, too.
        Let's remove those too.
        """
        children_to_delete = []
        for ci, child in enumerate(node.get("children", [])):
            # Mark all unrecognized exercises for deletion
            if child["kind"] != "Topic":
                continue

            recurse_nodes_to_remove_childless_nodes(child)

            if not child.get("children"):
                children_to_delete.append(ci)
                logging.debug("Removing KA childless topic: %s" % child["slug"])

        for ci in reversed(children_to_delete):
            del node["children"][ci]
    recurse_nodes_to_remove_childless_nodes(topictree)


    # Write the final topic tree to disk
    with open(os.path.join(data_path, topic_tools.topics_file), "w") as fp:
        fp.write(json.dumps(topictree, indent=2))

    return topictree
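
All three recursive passes above collect the indices of children to remove and then delete them in reversed() order, so earlier deletions do not shift the indices of later ones. A minimal standalone sketch of that pruning pattern on a toy tree; the prune function, the is_unwanted predicate, and the sample nodes are invented for illustration.

def prune(node, is_unwanted):
    children_to_delete = []
    for i, child in enumerate(node.get("children", [])):
        if is_unwanted(child):
            children_to_delete.append(i)
            continue
        prune(child, is_unwanted)
    # Delete from the end backwards so earlier indices stay valid
    for i in reversed(children_to_delete):
        del node["children"][i]

tree = {"kind": "Topic", "children": [
    {"kind": "Exercise", "slug": "old-exercise"},
    {"kind": "Video", "slug": "keep-me"},
]}
prune(tree, lambda n: n["kind"] == "Exercise")
# tree["children"] now holds only the Video node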