Example No. 1
def _download_html_exercise(exercise_id):
    """
    Download an exercise and return its exercise id *if* the downloaded
    file for the selected language differs from the English version.
    """
    # lang_codes, force, BUILD_DIR and EN_BUILD_DIR are assumed to come
    # from the enclosing scope.
    try:
        for lang in lang_codes:
            lang_url = EXERCISE_DOWNLOAD_URL_TEMPLATE.format(
                id=exercise_id, lang=lang)
            en_url = EXERCISE_DOWNLOAD_URL_TEMPLATE.format(
                id=exercise_id, lang=EN_LANG_CODE)
            try:
                lang_file = download_and_cache_file(lang_url,
                                                    cachedir=BUILD_DIR,
                                                    ignorecache=force)
                en_file = download_and_cache_file(en_url,
                                                  cachedir=EN_BUILD_DIR,
                                                  ignorecache=force)
                if not filecmp.cmp(lang_file, en_file, shallow=False):
                    return exercise_id
            except requests.exceptions.HTTPError as e:
                logging.warning(
                    "Failed to fetch html for lang: {}, exercise {}, exception: {}"
                    .format(lang, exercise_id, e))
    except requests.exceptions.HTTPError as e:
        logging.warning(
            "Failed to fetch exercise for lang_codes: {}, exception: {}".
            format(lang_codes, e))
        return None
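
All of these examples lean on the same helper, download_and_cache_file, whose implementation is not shown in any of the snippets. The call sites imply a contract of "URL in, local file path out" with cachedir, filename and ignorecache controls. A minimal sketch under those assumptions (the defaults and error handling here are guesses, not the real function):

import os
import requests

def download_and_cache_file(url, cachedir="build", filename=None, ignorecache=False):
    """Download url into cachedir and return the local file path (sketch)."""
    filename = filename or os.path.basename(url)
    path = os.path.join(cachedir, filename)
    # Cache hit: reuse the file on disk unless the caller forces a refetch.
    if os.path.exists(path) and not ignorecache:
        return path
    os.makedirs(os.path.dirname(path), exist_ok=True)
    response = requests.get(url)
    response.raise_for_status()  # consistent with the HTTPError handling above
    with open(path, "wb") as f:
        f.write(response.content)
    return path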
Example No. 2
def retrieve_subtitles(videos: list, lang="en", force=False) -> list:
    """Return the list of youtube ids whose subtitles were downloaded.

    videos contains a list of youtube ids.
    """
    downloaded_videos = []
    not_downloaded_videos = []  # ids with no Amara entry; tracked but not returned
    for youtube_id in videos:
        request_url = (
            "https://www.amara.org/api2/partners/videos/?format=json&video_url=http://www.youtube.com/watch?v=%s"
            % youtube_id
        )

        try:
            response = requests.get(request_url)
            response.raise_for_status()
        except requests.exceptions.HTTPError:
            print("Skipping {}".format(youtube_id))
            continue

        content = ujson.loads(response.content)
        if not content["objects"]:
            not_downloaded_videos.append(youtube_id)
            continue

        amara_id = content["objects"][0]["id"]
        subtitle_download_uri = "https://www.amara.org/api/videos/%s/languages/%s/subtitles/?format=vtt" % (
            amara_id,
            lang,
        )
        try:
            # Probe the URI first; a 404 means no subtitles exist in this language.
            urllib.request.urlopen(subtitle_download_uri)
        except urllib.error.HTTPError:
            continue
        file_dir = os.path.join(os.getcwd(), "build", "subtitles", lang)
        filename = "{}.vtt".format(youtube_id)
        download_and_cache_file(subtitle_download_uri, file_dir,
                                filename=filename, ignorecache=force)
        downloaded_videos.append(youtube_id)

    return downloaded_videos
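
A hypothetical invocation (the youtube id is a placeholder):

subtitled = retrieve_subtitles(["dQw4w9WgXcQ"], lang="pt", force=False)
print("downloaded subtitles for:", subtitled)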
Example No. 3
def _download_html_exercise(exercise_id):
    """
    Download an exercise and return its exercise id *if* the downloaded
    file for the selected language differs from the English version.
    """
    lang_url = EXERCISE_DOWNLOAD_URL_TEMPLATE.format(id=exercise_id, lang=lang)
    en_url = EXERCISE_DOWNLOAD_URL_TEMPLATE.format(id=exercise_id, lang="en")
    try:
        lang_file = download_and_cache_file(lang_url, cachedir=BUILD_DIR, ignorecache=force)
        en_file = download_and_cache_file(en_url, cachedir=EN_BUILD_DIR, ignorecache=force)
        if not filecmp.cmp(lang_file, en_file, shallow=False):
            return exercise_id
    except urllib.error.HTTPError:
        return None
Example No. 4
def _download_html_exercise(exercise_id):
    """
    Download an exercise and return its exercise id *if* the downloaded
    file for the selected language differs from the English version.
    """
    lang_url = EXERCISE_DOWNLOAD_URL_TEMPLATE.format(id=exercise_id, lang=lang)
    en_url = EXERCISE_DOWNLOAD_URL_TEMPLATE.format(id=exercise_id, lang="en")
    try:
        lang_file = download_and_cache_file(lang_url, cachedir=BUILD_DIR, ignorecache=force)
        en_file = download_and_cache_file(en_url, cachedir=EN_BUILD_DIR, ignorecache=force)
        if not filecmp.cmp(lang_file, en_file, shallow=False):
            return exercise_id
    except requests.exceptions.HTTPError as e:
        logging.warning("Failed to fetch html for exercise {}, exception: {}".format(exercise_id, e))
        return None
Example No. 5
def retrieve_translations(crowdin_project_name,
                          crowdin_secret_key,
                          lang_code=EN_LANG_CODE,
                          force=False,
                          includes="*.po") -> Catalog:
    request_url_template = ("https://api.crowdin.com/api/"
                            "project/{project_id}/download/"
                            "{lang_code}.zip?key={key}")
    export_url_template = ("https://api.crowdin.com/api/"
                           "project/{project_id}/export/"
                           "{lang_code}.zip?key={key}")
    request_url = request_url_template.format(
        project_id=crowdin_project_name,
        lang_code=lang_code,
        key=crowdin_secret_key,
    )
    export_url = export_url_template.format(
        project_id=crowdin_project_name,
        lang_code=lang_code,
        key=crowdin_secret_key,
    )

    logging.info("requesting CrowdIn to rebuild latest translations.")
    try:
        requests.get(export_url)
    except requests.exceptions.RequestException as e:
        logging.warning(
            "Got exception when building CrowdIn translations: {}".format(e))

    logging.debug("Retrieving translations from {}".format(request_url))
    zip_path = download_and_cache_file(request_url, ignorecache=force)
    zip_extraction_path = tempfile.mkdtemp()

    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(zip_extraction_path)

    all_filenames = glob.iglob(os.path.join(zip_extraction_path, "**"),
                               recursive=True)
    filenames = fnmatch.filter(all_filenames, includes)

    # use the polib library, since it's much faster at concatenating
    # po files.  it doesn't have a dict interface though, so we'll
    # reread the file using babel.Catalog.
    with tempfile.NamedTemporaryFile() as f:
        main_pofile = polib.POFile(fpath=f.name)

        for filename in filenames:
            pofile = polib.pofile(filename)
            main_pofile.merge(pofile)

        for entry in main_pofile:
            entry.obsolete = False

        main_pofile.save()

    shutil.rmtree(zip_extraction_path)

    msgid_mapping = Catalog(main_pofile)

    return msgid_mapping
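
The export request asks Crowdin to rebuild the translation archive before it is downloaded; failures there are only logged, since a stale archive is still usable. A hypothetical call, with placeholder project name and API key:

catalog = retrieve_translations("my-project", "0123456789abcdef", lang_code="pt", force=True)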
Example No. 6
    def _download_subtitle_data(youtube_id):

        logging.info("trying to download subtitle for %s" % youtube_id)
        request_url = "https://www.amara.org/api2/partners/videos/?format=json&video_url=http://www.youtube.com/watch?v=%s" % (
            youtube_id)

        try:
            amara_id_file = retrieve_subtitle_meta_data(
                request_url,
                filename="subtitles/meta_data/{youtube_id}".format(
                    youtube_id=youtube_id))
            with open(amara_id_file, 'r') as f:
                amara_id = f.read()
            subtitle_download_uri = "https://www.amara.org/api/videos/%s/languages/%s/subtitles/?format=vtt" % (
                amara_id, lang)
            filename = "subtitles/{lang}/{youtube_id}.vtt".format(
                lang=lang, youtube_id=youtube_id)
            subtitle_path = download_and_cache_file(subtitle_download_uri,
                                                    filename=filename,
                                                    ignorecache=False)
            logging.info("subtitle path: {}".format(subtitle_path))
            return youtube_id, subtitle_path
        except (requests.exceptions.RequestException, KeyError,
                urllib.error.HTTPError, urllib.error.URLError) as e:
            logging.info("got error while downloading subtitles: {}".format(e))
            return None
Example No. 7
    def _download_subtitle_data(youtube_id):

        logging.info("trying to download subtitle for %s" % youtube_id)
        request_url = (
            "https://www.amara.org/api2/partners/videos/?format=json&video_url=http://www.youtube.com/watch?v=%s"
            % (youtube_id)
        )

        try:
            amara_id_file = retrieve_subtitle_meta_data(
                request_url, filename="subtitles/meta_data/{youtube_id}".format(youtube_id=youtube_id)
            )
            with open(amara_id_file, "r") as f:
                amara_id = f.read()
            subtitle_download_uri = "https://www.amara.org/api/videos/%s/languages/%s/subtitles/?format=vtt" % (
                amara_id,
                lang,
            )
            filename = "subtitles/{lang}/{youtube_id}.vtt".format(lang=lang, youtube_id=youtube_id)
            subtitle_path = download_and_cache_file(subtitle_download_uri, filename=filename, ignorecache=False)
            logging.info("subtitle path: {}".format(subtitle_path))
            return youtube_id, subtitle_path
        except (requests.exceptions.RequestException, KeyError, urllib.error.HTTPError, urllib.error.URLError) as e:
            logging.info("got error while downloading subtitles: {}".format(e))
            return None
Example No. 8
def retrieve_translations(crowdin_project_name, crowdin_secret_key, lang_code=EN_LANG_CODE, force=False,
                          includes="*.po") -> Catalog:
    request_url_template = ("https://api.crowdin.com/api/"
                            "project/{project_id}/download/"
                            "{lang_code}.zip?key={key}")
    export_url_template = ("https://api.crowdin.com/api/"
                           "project/{project_id}/export/"
                           "{lang_code}.zip?key={key}")
    request_url = request_url_template.format(
        project_id=crowdin_project_name,
        lang_code=lang_code,
        key=crowdin_secret_key,
    )
    export_url = export_url_template.format(
        project_id=crowdin_project_name,
        lang_code=lang_code,
        key=crowdin_secret_key,
    )

    logging.info("requesting CrowdIn to rebuild latest translations.")
    try:
        requests.get(export_url)
    except requests.exceptions.RequestException as e:
        logging.warning(
            "Got exception when building CrowdIn translations: {}".format(e)
        )

    logging.debug("Retrieving translations from {}".format(request_url))
    zip_path = download_and_cache_file(request_url, ignorecache=force)
    zip_extraction_path = tempfile.mkdtemp()

    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(zip_extraction_path)

    all_filenames = glob.iglob(
        os.path.join(zip_extraction_path, "**"),
        recursive=True
    )
    filenames = fnmatch.filter(all_filenames, includes)

    # use the polib library, since it's much faster at concatenating
    # po files.  it doesn't have a dict interface though, so we'll
    # reread the file using babel.Catalog.
    with tempfile.NamedTemporaryFile() as f:
        main_pofile = polib.POFile(fpath=f.name)

        for filename in filenames:
            pofile = polib.pofile(filename)
            main_pofile.merge(pofile)

        for entry in main_pofile:
            entry.obsolete = False

        main_pofile.save()

    shutil.rmtree(zip_extraction_path)

    msgid_mapping = Catalog(main_pofile)

    return msgid_mapping
Example No. 9
def retrieve_kalite_topic_data(url=None, force=False):
    """
    Retrieve the KA Lite topics.json file in the master branch.  If
    url is given, download from that url instead.
    """
    if not url:
        url = "https://raw.githubusercontent.com/learningequality/ka-lite/master/data/khan/topics.json"

    path = download_and_cache_file(url, ignorecache=force)
    with open(path) as f:
        return ujson.load(f)
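
Hypothetical usage; the "children"/"title" structure is assumed from KA Lite's topic tree and is not shown in the snippet:

topics = retrieve_kalite_topic_data()
for node in topics.get("children", []):
    print(node.get("title"))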
Example No. 10
def retrieve_translations(crowdin_project_name, crowdin_secret_key, lang_code="en",
                          force=False, includes="*.po") -> polib.POFile:
    request_url_template = ("https://api.crowdin.com/api/"
                            "project/{project_id}/download/"
                            "{lang_code}.zip?key={key}")
    request_url = request_url_template.format(
        project_id=crowdin_project_name,
        lang_code=lang_code,
        key=crowdin_secret_key,
    )

    zip_path = download_and_cache_file(request_url, ignorecache=force)
    zip_extraction_path = tempfile.mkdtemp()

    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(zip_extraction_path)

    all_filenames = glob.iglob(
        os.path.join(zip_extraction_path, "**"),
        recursive=True
    )
    filenames = fnmatch.filter(all_filenames, includes)

    # use the polib library, since it's much faster at concatenating
    # po files.  it doesn't have a dict interface though, so we attach
    # a msgid -> msgstr mapping below.
    with tempfile.NamedTemporaryFile() as f:
        main_pofile = polib.POFile(fpath=f.name)

        for filename in filenames:
            pofile = polib.pofile(filename)
            main_pofile.merge(pofile)

        for entry in main_pofile:
            entry.obsolete = False

        main_pofile.save()

    shutil.rmtree(zip_extraction_path)

    # add convenience dict for mapping a msgid to msgstr
    main_pofile.msgid_mapping = {m.msgid: m.msgstr for m in main_pofile if m.translated()}

    return main_pofile
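
Since this variant returns the polib.POFile itself, lookups go through the attached msgid_mapping. A hypothetical gettext-style helper built on it, falling back to the source string when a translation is missing:

pofile = retrieve_translations("my-project", "0123456789abcdef", lang_code="pt")

def translate(msgid):
    return pofile.msgid_mapping.get(msgid, msgid)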
Example No. 11
def retrieve_assessment_item_data(assessment_item, lang=None, force=False) -> (dict, [str]):
    """
    Retrieve assessment item data and images for a single assessment item.
    :param assessment_item: id of assessment item
    :param lang: language to retrieve data in
    :param force: refetch assessment item and images even if it exists on disk
    :return: tuple of dict of assessment item data and list of paths to files
    """
    if lang:
        url = "http://www.khanacademy.org/api/v1/assessment_items/{assessment_item}?lang={lang}"
        filename = "assessment_items/{assessment_item}_{lang}.json"
    else:
        url = "http://www.khanacademy.org/api/v1/assessment_items/{assessment_item}"
        filename = "assessment_items/{assessment_item}.json"
    try:
        # str.format ignores unused keyword arguments, so passing lang here is
        # harmless when the template has no {lang} placeholder.
        url = url.format(assessment_item=assessment_item, lang=lang)
        filename = filename.format(assessment_item=assessment_item, lang=lang)
        path = download_assessment_item_data(url, filename=filename, lang=lang, force=force)
    except requests.RequestException:
        logging.error("Download failure for assessment item: {assessment_item}".format(assessment_item=assessment_item))
        raise

    with open(path, "r") as f:
        item_data = json.load(f)

    image_urls = find_all_image_urls(item_data)
    graphie_urls = find_all_graphie_urls(item_data)

    file_paths = []

    for url in itertools.chain(image_urls, graphie_urls):
        filename = MANUAL_IMAGE_URL_TO_FILENAME_MAPPING.get(url, os.path.basename(url))
        filepath = _get_subpath_from_filename(filename)
        file_paths.append(download_and_cache_file(url, filename=filepath))

    item_data = localize_image_urls(item_data)
    item_data = localize_content_links(item_data)
    item_data = localize_graphie_urls(item_data)

    return item_data, file_paths
Example No. 12
    def test_returns_existing_file(self):
        url = "https://google.com"
        path = download_and_cache_file(url)

        assert os.path.exists(path)
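
A hypothetical companion test for the caching behaviour implied by ignorecache: a second call with the same url should return the same cached path. (Like the test above, this hits the network on the first call.)

    def test_cache_hit_returns_same_path(self):
        url = "https://google.com"
        first = download_and_cache_file(url)
        second = download_and_cache_file(url)

        assert first == second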
Example No. 13
def _download_image_urls(url):
    filename = MANUAL_IMAGE_URL_TO_FILENAME_MAPPING.get(
        url, os.path.basename(url))
    filepath = _get_subpath_from_filename(filename)
    return download_and_cache_file(url, filename=filepath)
Example No. 14
def _download_image_urls(url):
    filename = MANUAL_IMAGE_URL_TO_FILENAME_MAPPING.get(url, os.path.basename(url))
    filepath = _get_subpath_from_filename(filename)
    return download_and_cache_file(url, filename=filepath)