Example #1
def download_and_convert_mpeg_file(mpeg_url):
    """
    Kolibri AudioNode only supports .mp3 files, not .mpeg, so we must convert.
    """
    mpeg_filename = mpeg_url.split('/')[-1]  # e.g. something.mpeg
    mpeg_path = os.path.join(DOWNLOADED_MPEG_FILES_DIR, mpeg_filename)
    print(mpeg_path, 'mpeg_path')

    # 1. DOWNLOAD
    download_file(mpeg_url, DOWNLOADED_MPEG_FILES_DIR)
    print("mpeg downloaded")

    # 2. CONVERT
    mp3_filename = mpeg_filename.replace('.mpeg', '.mp3')
    mp3_path = os.path.join(CONVERTED_MP3_FILES_DIR, mp3_filename)
    print(mp3_filename, mp3_path)
    if not os.path.exists(mp3_path):
        try:
            command = [
                "ffmpeg", "-i", mpeg_path, "-acodec", "mp3", "-ac", "2", "-ab",
                "64k", "-y", "-hide_banner", "-loglevel", "warning", mp3_path
            ]
            subprocess.check_call(command)
            print("Successfully converted mpeg file to mp3")
        except subprocess.CalledProcessError:
            print("Problem converting " + mpeg_url)
            return None

    # Return path of converted mp3 file
    return mp3_path
Example #2
def download_and_convert_m4v_file(m4v_url):
    """
    Kolibri VideoNode only supports .mp4 files, not .m4v, so we must convert.
    """
    m4v_filename = m4v_url.split('/')[-1]  # e.g. something.m4v
    m4v_path = os.path.join(DOWNLOADED_M4V_FILES_DIR, m4v_filename)
    print(m4v_path, 'm4v_path')

    # 1. DOWNLOAD M4V file
    download_file(m4v_url, DOWNLOADED_M4V_FILES_DIR)
    print("m4v downloaded")

    # 2. CONVERT
    mp4_filename = m4v_filename.replace('.m4v', '.mp4')
    mp4_path = os.path.join(CONVERTED_MP4_FILES_DIR, mp4_filename)
    print(mp4_filename, mp4_path)
    if not os.path.exists(mp4_path):
        try:
            command = [
                "ffmpeg", "-i", m4v_path, "-vcodec", "copy", "-acodec", "copy",
                mp4_path
            ]
            subprocess.check_call(command)
            print("Successfully converted m4v file to mp4")
        except subprocess.CalledProcessError:
            print("Problem converting " + png_url)
            return None

    # Return path of converted mp4 file
    return mp4_path
Example #3
def process_wikipedia_page(content, baseurl, destpath, **kwargs):
    """ Saves images to html zip folder """
    page = BeautifulSoup(content, "html.parser")
    index = 0

    # Add style sheets to zip file
    for link in page.find_all("link"):
        if link.get('href') and 'stylesheet' in link['rel']:
            try:
                subpath = "item_{}".format(index)
                link["href"], _ = download_file(make_fully_qualified_url(
                    link['href']),
                                                destpath,
                                                subpath=subpath)
                index = index + 1
            except Exception:
                link["href"] = "#"

    # Add images to zip file
    for image in page.find_all("img"):
        try:
            relpath, _ = download_file(make_fully_qualified_url(image["src"]),
                                       destpath)
            image["src"] = relpath
        except Exception:
            image["src"] = "#"

    # Replace links with text to avoid broken links
    content = str(page)
    for link in page.find_all("a"):
        if link.get('href') and not link['href'].startswith("#"):
            content = content.replace(str(link), link.text)

    return content
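Note: make_fully_qualified_url is used throughout these examples but never defined in any of them. A minimal sketch of what it could look like, assuming it resolves relative and protocol-relative paths against a module-level base URL (the name BASE_URL and its value are assumptions, not part of the examples):

from urllib.parse import urljoin

BASE_URL = 'https://en.wikipedia.org'  # assumed; the real base URL is not shown

def make_fully_qualified_url(url):
    # Give protocol-relative URLs (//upload.wikimedia.org/...) a scheme.
    if url.startswith('//'):
        return 'https:' + url
    # Resolve relative paths against the site root.
    return urljoin(BASE_URL, url)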
Example #4
    def download_assets(selector, attr, url_middleware=None,
            content_middleware=None, node_filter=None):
        nodes = doc.select(selector)

        for i, node in enumerate(nodes):

            if node_filter:
                if not node_filter(node):
                    src = node[attr]
                    node[attr] = ''
                    print('        Skipping node with src ', src)
                    continue

            if node[attr].startswith('data:'):
                continue

            url = urljoin(base_url, node[attr])

            if _is_blacklisted(url, url_blacklist):
                print('        Skipping downloading blacklisted url', url)
                node[attr] = ""
                continue

            if url_middleware:
                url = url_middleware(url)

            filename = derive_filename(url)
            node[attr] = filename

            print("        Downloading", url, "to filename", filename)
            download_file(url, destination, request_fn=request_fn,
                    filename=filename, middleware_callbacks=content_middleware)
Example #5
def download_writing_topic_category(category_doc, title, level_id):
    destination = tempfile.mkdtemp()

    # Download a font
    font_url = make_fully_qualified_url(
        '//fonts.googleapis.com/css?family=Roboto:400,300,300italic,400italic,700,700italic'
    )
    download_file(font_url,
                  destination,
                  request_fn=make_request,
                  filename='roboto.css')

    # Write out the HTML source, based on CSS formatting from
    # https://k12.thoughtfullearning.com/resources/writingtopics

    topics = (("<li>%s</li>" % topic.text)
              for topic in category_doc.select('.views-row'))
    html_source = """
        <!DOCTYPE html>
        <head>
            <link href='roboto.css' rel='stylesheet' type='text/css'>
            <style>
                ul {
                    margin: 0 0 0 40px;
                    padding: 0;
                }
                li {
                    font-family: "Roboto", sans-serif;
                    font-weight: 300;
                    font-size: 19.2px;
                    line-height: 24.96px;
                    color: #202020;
                    margin-top: 10px;
                }
            </style>
        </head>
        <body>
            <ul>%s</ul>
        </body>
    """ % ''.join(topics)

    with open(os.path.join(destination, "index.html"), "w") as f:
        f.write(html_source)

    print("    ... downloaded to %s" % destination)
    #preview_in_browser(destination)

    zip_path = create_predictable_zip(destination)
    return nodes.HTML5AppNode(
        source_id="%s|%s" % (level_id, title),
        title=truncate_metadata(title),
        license=licenses.CC_BY_NC_SALicense(
            copyright_holder=truncate_metadata('Thoughtful Learning')),
        files=[files.HTMLZipFile(zip_path)],
        language="en",
        thumbnail=writing_topic_thumbnail,
    )
Example #6
def repl(match):
    src = match.group(1)
    if src.startswith('//localhost'):
        return 'src()'
    # Don't download data: files
    if src.startswith('data:'):
        return match.group(0)
    src_url = make_fully_qualified_url(src)
    derived_filename = derive_filename(src_url)
    download_file(src_url, destination, request_fn=make_request,
                  filename=derived_filename)
    return 'src("%s")' % derived_filename
Example #7
def download_assets(doc, selector, attr, destination, middleware=None):
    """
    Find all assets in `attr` for DOM elements that match `selector` within doc
    and download them to `destination` dir.
    """
    nodes = doc.select(selector)
    for i, node in enumerate(nodes):
        url = make_fully_qualified_url(node[attr])
        filename = "%s_%s" % (i, os.path.basename(url))
        node[attr] = filename
        download_file(url,
                      destination,
                      request_fn=make_request,
                      filename=filename,
                      middleware_callbacks=middleware)
Example #8
def download_wikipedia_page(url, thumbnail, title):
    """ Create zip file to use for html pages """
    destpath = tempfile.mkdtemp()  # Create a temp directory to house our downloaded files

    # Download the main Wikipedia page, apply a middleware processor, and call it index.html
    localref, _ = download_file(
        url,
        destpath,
        filename="index.html",
        middleware_callbacks=process_wikipedia_page,
    )

    zippath = create_predictable_zip(destpath)  # Turn the temp folder into a zip file

    # Create an HTML5 app node
    html5app = nodes.HTML5AppNode(
        files=[files.HTMLZipFile(zippath)],
        title=title,
        thumbnail=thumbnail,
        source_id=url.split("/")[-1],
        license=CHANNEL_LICENSE,
    )

    return html5app
Example #9
    def js_middleware(content, url, **kwargs):
        # Download all images referenced in JS files
        for img in IMAGES_IN_JS_RE.findall(content):
            url = make_fully_qualified_url('/images/%s' % img)
            print("        Downloading", url, "to filename", img)
            download_file(url,
                          destination,
                          subpath="images",
                          request_fn=make_request,
                          filename=img)

        # Polyfill localStorage and document.cookie as iframes can't access
        # them
        return (content
                .replace("localStorage", "_localStorage")
                .replace('document.cookie.split', '"".split')
                .replace('document.cookie', 'window._document_cookie'))
Example #10
def download_wikipedia_page(url, thumbnail, title):
    # create a temp directory to house our downloaded files
    destpath = tempfile.mkdtemp()

    # download the main Wikipedia page, apply a middleware processor, and call it index.html
    localref, _ = download_file(
        url,
        destpath,
        filename="index.html",
        middleware_callbacks=process_wikipedia_page,
        request_fn=make_request,
    )

    # turn the temp folder into a zip file
    zippath = create_predictable_zip(destpath)

    # create an HTML5 app node
    html5app = HTML5AppNode(
        files=[HTMLZipFile(zippath)],
        title=title,
        thumbnail=thumbnail,
        source_id=url.split("/")[-1],
        license=licenses.PublicDomainLicense(),
    )

    return html5app
Example #11
def get_phet_zip_file(zip_file_url, main_file_and_query):
    """
    PhET simulations are provided in the zip file `phet.zip`, and the entry point
    is passed as a GET parameter in `main_file_and_query`. To make these compatible
    with Kolibri's default behaviour of loading index.html, we will:
      - Rename index.html to phetindex.html
      - Add a custom index.html that uses a javascript redirect to phetindex.html?{sim_id}
    """
    u = urlparse(main_file_and_query)
    idk, sim_id = u.query.split('=')
    assert idk == 'id', 'unknown query string format found: ' + main_file_and_query
    main_file = u.scheme + '://' + u.netloc + u.path  # skip querystring

    destpath = tempfile.mkdtemp()
    LOGGER.info('saving phet zip file in dir ' + destpath)
    try:
        download_file(zip_file_url, destpath, request_fn=make_request)

        zip_filename = zip_file_url.split('/')[-1]
        zip_basename = zip_filename.rsplit('.', 1)[0]
        zip_folder = os.path.join(destpath, zip_basename)

        # Extract zip file contents.
        local_zip_file = os.path.join(destpath, zip_filename)
        with zipfile.ZipFile(local_zip_file) as zf:
            zf.extractall(destpath)

        # Rename main_file to phetindex.html
        main_file = main_file.split('/')[-1]
        src = os.path.join(zip_folder, main_file)
        dest = os.path.join(zip_folder, 'phetindex.html')
        os.rename(src, dest)

        # Create the custom index.html that redirects to phetindex.html?id={sim_id}
        index_html = PHET_INDEX_HTML_TEMPLATE.format(sim_id=sim_id)
        with open(os.path.join(zip_folder, 'index.html'), 'w') as indexf:
            indexf.write(index_html)

        # Always be zipping!
        return create_predictable_zip(zip_folder)

    except Exception as e:
        LOGGER.error("get_phet_zip_file: %s, %s, %s, %s" %
                     (zip_file_url, main_file_and_query, destpath, e))
        return None
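PHET_INDEX_HTML_TEMPLATE is referenced above but not defined in the example. A minimal sketch of such a JavaScript-redirect page, assuming the renamed entry point phetindex.html and the id query parameter seen in the code (the exact markup is an assumption):

PHET_INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
<html>
<head>
<script>
  // Redirect to the renamed PhET entry point, preserving the sim id.
  window.location.href = 'phetindex.html?id={sim_id}';
</script>
</head>
<body></body>
</html>
"""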
Example #12
def process_wikipedia_page(content, baseurl, destpath, **kwargs):

    page = BeautifulSoup(content, "html.parser")

    for image in page.find_all("img"):
        relpath, _ = download_file(make_fully_qualified_url(image["src"]),
                                   destpath, request_fn=make_request)
        image["src"] = relpath

    return str(page)
Example #13
        def repl(match):
            src = match.group(1)
            if src.startswith('//localhost'):
                return 'url()'
            # Don't download data: files
            if src.startswith('data:'):
                return match.group(0)
            src_url = urljoin(base_url, os.path.join(file_dir, src))

            if _is_blacklisted(src_url, url_blacklist):
                print('        Skipping downloading blacklisted url', src_url)
                return 'url()'

            derived_filename = derive_filename(src_url)
            download_file(src_url,
                          destination,
                          request_fn=request_fn,
                          filename=derived_filename)
            return 'url("%s")' % derived_filename
Example #14
        def repl(match):
            src = match.group(1)

            if src.startswith('//localhost'):
                return 'url()'
            # Don't download data: files
            if src.startswith('data:'):
                return match.group(0)
            parts = urlparse(src)
            root_url = None
            if url:
                root_parts = urlparse(url)
                root_url = url[:url.rfind('/') + 1]

            if parts.scheme and parts.netloc:
                src_url = src
            elif parts.path.startswith('/') and url:
                src_url = '{}://{}{}'.format(root_parts.scheme, root_parts.netloc, parts.path)
            elif url and root_url:
                src_url = urljoin(root_url, src)
            else:
                src_url = urljoin(base_url, src)

            if _is_blacklisted(src_url, url_blacklist):
                print('        Skipping downloading blacklisted url', src_url)
                return 'url()'

            derived_filename = derive_filename(src_url)

            # The _derive_filename function puts all files in the root, so all URLs
            # need to be rewritten. When using get_archive_filename, relative URLs
            # will still work.
            new_url = src
            if derive_filename == _derive_filename:
                if url and parts.path.startswith('/'):
                    parent_url = derive_filename(url)
                    new_url = os.path.relpath(src, os.path.dirname(parent_url))
                else:
                    new_url = derived_filename

            download_file(src_url, destination, request_fn=request_fn,
                    filename=derived_filename)
            return 'url("%s")' % new_url
Example #15
def download_assets_from_github(repo_name, repo_path, destination):
    print('        Downloading files from GitHub repo %s/%s ...' %
          (repo_name, repo_path))

    access_token_param = ''
    if _GITHUB_API_TOKEN:
        access_token_param = '&access_token=%s' % _GITHUB_API_TOKEN

    url = 'https://api.github.com/repos/%s/contents/%s?ref=master%s' % (
        repo_name, repo_path, access_token_param)
    response = make_request(url)

    for item in response.json():
        if item['type'] == 'file':
            filename = item['name']
            download_url = item['download_url']
            print('        Downloading %s' % download_url)
            download_file(download_url,
                          destination,
                          request_fn=make_request,
                          filename=filename)
Example #16
    def js_middleware(content, url, **kwargs):
        if DEBUG_MODE:
            print('in js_middleware', url)
        # Download all images referenced in JS files
        for img in IMAGES_IN_JS_RE.findall(content):
            url = make_fully_qualified_url('/images/%s' % img)
            print("Downloading", url, "to filename", img)
            download_file(url, destination, subpath="images",
                    request_fn=make_request, filename=img)

        # Monkey-patch the js code that uses localStorage and document.cookie to
        # use window._localStorage (a plain js object) instead of the real localStorage.
        # This change primarily affects the functions getStoredValue and setStoredValue,
        # which are used to set the following properties:
        #  - diffRange: sets age-range for stories (needed to avoid a dialog popup)
        #  - lng: set to Arabic
        #  - audio: toggles between read-aloud vs. no read-aloud
        return (content
            .replace("localStorage", "_localStorage")
            .replace('document.cookie.split', '"".split')
            .replace('document.cookie', 'window._document_cookie'))
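The replacements above only rename the identifiers; for the patched code to run, a plain-object shim presumably has to be injected into the page HTML elsewhere in the chef. A hedged sketch of what that shim could look like (the injection site and the shim itself are assumptions, not shown in the example):

# Script tag to inject into the page <head> before any patched JS runs.
LOCALSTORAGE_SHIM = """
<script>
  window._localStorage = {
      getItem: function (key) { return key in this ? this[key] : null; },
      setItem: function (key, value) { this[key] = String(value); },
      removeItem: function (key) { delete this[key]; }
  };
  window._document_cookie = '';
</script>
"""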
Example #17
    def download_assets(selector, attr, url_middleware=None,
            content_middleware=None, node_filter=None):
        nodes = doc.select(selector)

        for i, node in enumerate(nodes):

            if node_filter:
                if not node_filter(node):
                    src = node[attr]
                    # node[attr] = ''
                    node.decompose()
                    print('Skipping node with src ', src)
                    continue

            # Remove preconnect and preload links from the header
            relattr = node.get('rel', None)
            if relattr in ['preconnect', 'preload', 'apple-touch-icon']:
                node.decompose()
                continue

            url = make_fully_qualified_url(node[attr])

            if is_blacklisted(url):
                print('Skipping downloading blacklisted url', url)
                node.decompose()
                # node[attr] = ""
                continue

            if url_middleware:
                url = url_middleware(url)

            filename = derive_filename(url)
            node[attr] = filename

            print("Downloading", url, "to filename", filename)
            download_file(url, destination, request_fn=make_request,
                    filename=filename, middleware_callbacks=content_middleware)
Example #18
def get_zip_file(zip_file_url, main_file):
    """HTML games are provided as zip files; the entry point of the game is
    `main_file`, which needs to be renamed to index.html to make it
    compatible with Kolibri.
    """
    destpath = tempfile.mkdtemp()
    try:
        download_file(zip_file_url, destpath, request_fn=make_request)

        zip_filename = zip_file_url.split('/')[-1]
        zip_basename = zip_filename.rsplit('.', 1)[0]
        zip_folder = os.path.join(destpath, zip_basename)

        # Extract zip file contents.
        local_zip_file = os.path.join(destpath, zip_filename)
        with zipfile.ZipFile(local_zip_file) as zf:
            zf.extractall(destpath)

        # In some cases, the files are under the www directory,
        # let's move them up one level.
        www_dir = os.path.join(zip_folder, 'www')
        if os.path.isdir(www_dir):
            files = os.listdir(www_dir)
            for f in files:
                shutil.move(os.path.join(www_dir, f), zip_folder)

        # Rename main_file to index.html.
        main_file = main_file.split('/')[-1]
        src = os.path.join(zip_folder, main_file)
        dest = os.path.join(zip_folder, 'index.html')
        os.rename(src, dest)

        return create_predictable_zip(zip_folder)
    except Exception as e:
        LOGGER.error("get_zip_file: %s, %s, %s, %s" %
                     (zip_file_url, main_file, destpath, e))
        return None
Example #19
def download_wikipedia_page(url, title, writer, thumbnail=None):
    """ Create zip file to use for html pages """
    destpath = tempfile.mkdtemp()  # Create a temp directory to house our downloaded files

    # Generate details for files
    details = {
        'thumbnail': thumbnail,
        'source_id': url.split("/")[-1],
        'license': CHANNEL_LICENSE,
    }

    # Download the main wikipedia page, apply middleware processor, and call it index.html
    localref, _ = download_file(url,
                                destpath,
                                filename="index.html",
                                middleware_callbacks=process_wikipedia_page)

    zippath = create_predictable_zip(
        destpath)  # Turn the temp folder into a zip file
    writer.add_file(str(PATH), title, zippath, **details)
Example #20
def download_additional_assets(destination, puzzle_name):
    url = make_fully_qualified_url('/third-party/JS-Interpreter/compiled.js')
    download_file(url,
                  os.path.join(destination, 'third-party/JS-Interpreter'),
                  request_fn=make_request,
                  filename='compiled.js')

    dir_name = puzzle_name
    if dir_name == 'pond-tutor' or dir_name == 'pond-duck':
        dir_name = 'pond'

        url = make_fully_qualified_url('/pond/docs/generated/en/compressed.js')
        download_file(url,
                      os.path.join(destination, 'pond/docs/generated/en'),
                      request_fn=make_request,
                      filename='compressed.js')

        url = make_fully_qualified_url('third-party/ace/worker-javascript.js')
        download_file(url,
                      destination,
                      request_fn=make_request,
                      filename='worker-javascript.js')

        download_assets_from_github('google/blockly-games',
                                    'appengine/pond/docs',
                                    os.path.join(destination, 'pond/docs'))

    download_assets_from_github('google/blockly-games',
                                'appengine/%s' % dir_name,
                                os.path.join(destination, dir_name))
    download_assets_from_github('google/blockly-games',
                                'appengine/%s' % dir_name, destination)
    download_assets_from_github('google/blockly-games', 'appengine/common',
                                os.path.join(destination, 'common'))
    download_assets_from_github('google/blockly', 'media', destination)
    download_assets_from_github(
        'google/blockly', 'media',
        os.path.join(destination, 'third-party/blockly/media'))
Example #21
def get_zip_file(zip_file_url, main_file):
    """
    HTML games are provided as zip files; the entry point of the game is `main_file`.
    The `main_file` needs to be renamed to index.html to make it compatible with Kolibri.
    """
    key = zip_file_url + main_file
    destpath = make_temporary_dir_from_key(key)

    # Check for "REPLACE WITH:" correction rule for the current `zip_file_url`
    replacement_url = should_replace_with(zip_file_url)
    if replacement_url:
        zip_file_url = replacement_url

    # return cached version if already there
    final_webroot_path = os.path.join(destpath, 'webroot.zip')
    if os.path.exists(final_webroot_path):
        return final_webroot_path

    try:
        download_file(zip_file_url, destpath, request_fn=make_request)

        zip_filename = zip_file_url.split('/')[-1]  # e.g. Mathematics.zip
        zip_basename = zip_filename.rsplit('.', 1)[0]  # e.g. Mathematics

        # July 31: handle edge cases where zip filename doesn't match folder name inside it
        awazchitras = [
            'Awazchitra_HI', 'Awazchitra_TL', 'Awazchitra_KN', 'Awazchitra_BN',
            'Awazchitra_OD', 'Awazchitra_PN', 'Awazchitra_TM'
        ]
        for awazchitra in awazchitras:
            if awazchitra in zip_basename:
                zip_basename = zip_basename.replace('Awazchitra', 'AwazChitra')
        if '_KKS_Hi' in zip_basename:
            zip_basename = zip_basename.replace('_KKS_Hi', '_KKS_HI')

        # Mar 2: more edge cases where zip filename doesn't match folder name inside it
        if 'Memorygamekb' in zip_basename:
            zip_basename = zip_basename.replace('Memorygamekb', 'MemoryGamekb')
        if 'cityofstories' in zip_basename:
            zip_basename = zip_basename.replace('cityofstories',
                                                'CityOfStories')

        # Jun 12: fix more edge cases where .zip filename doesn't match dir name
        if '_KKS_Gj' in zip_basename:
            zip_basename = zip_basename.replace('_KKS_Gj', '_KKS_GJ')
        if 'ShabdKhel' in zip_basename:
            zip_basename = zip_basename.replace('ShabdKhel', 'Shabdkhel')

        zip_folder = os.path.join(destpath, zip_basename)  # e.g. destpath/Mathematics/
        main_file = main_file.split('/')[-1]  # e.g. activity_name.html or index.html

        if 'KhelbadiKahaniyan_MR' in zip_basename:
            # Inconsistency --- `main_file` contains dir name, and not index.html
            main_file = 'index.html'

        # Jul 8th: handle weird case-insensitive webserver main_file
        if main_file == 'mainexpand.html':
            main_file = 'mainExpand.html'  # <-- this is the actual filename in the zip

        # Zip files from Pratham website have the web content inside subfolder
        # of the same as the zip filename. We need to recreate these zip files
        # to make sure the index.html is in the root of the zip.
        local_zip_file = os.path.join(destpath, zip_filename)
        with zipfile.ZipFile(local_zip_file) as zf:
            # If main_file is in the root (like zips from the game repository)
            # then we need to extract the zip contents to subfolder zip_basename/
            for zfileinfo in zf.filelist:
                if zfileinfo.filename == main_file:
                    destpath = os.path.join(destpath, zip_basename)
            # Extract zip so main file will be in destpath/zip_basename/index.html
            zf.extractall(destpath)

        # In some cases, the files are under the www directory,
        # let's move them up one level.
        www_dir = os.path.join(zip_folder, 'www')
        if os.path.isdir(www_dir):
            files = os.listdir(www_dir)
            for f in files:
                shutil.move(os.path.join(www_dir, f), zip_folder)

        # Rename `main_file` to index.html
        src = os.path.join(zip_folder, main_file)
        dest = os.path.join(zip_folder, 'index.html')
        os.rename(src, dest)

        # Logic to add margin-top:44px; for games that match Corrections tab
        add_margin_top = False
        for row in PRADIGI_CORRECTIONS_LIST:
            if row[CORRECTIONS_ACTION_KEY] == ADD_MARGIN_TOP_ACTION:
                pat = row[CORRECTIONS_SOURCE_URL_PAT_KEY]
                m = pat.match(zip_file_url)
                if m:
                    add_margin_top = True
        if add_margin_top:
            if zip_file_url.endswith('CourseContent/Games/Mathematics.zip'):
                LOGGER.info(
                    "adding body.margin-top:44px; to ALL .html files in: %s" %
                    zip_file_url)
                for root, dirs, files in os.walk(zip_folder):
                    for file in files:
                        if file.endswith(".html"):
                            add_body_margin_top(root, file)
            else:
                LOGGER.info(
                    "adding body.margin-top:44px; to index.html in: %s" %
                    zip_file_url)
                add_body_margin_top(zip_folder, 'index.html')

        # Replace occurrences of `main_file` with index.html to avoid broken links
        for root, dirs, files in os.walk(zip_folder):
            for file in files:
                if file.endswith(".html") or file.endswith(".js"):
                    file_path = os.path.join(root, file)
                    # use bytes to avoid Unicode errors "invalid start/continuation byte"
                    bytes_in = open(file_path, 'rb').read()
                    bytes_out = bytes_in.replace(main_file.encode('utf-8'),
                                                 b'index.html')
                    open(file_path, 'wb').write(bytes_out)

        # create the zip file and copy it to the final cached location
        tmp_predictable_zip_path = create_predictable_zip(zip_folder)
        shutil.copyfile(tmp_predictable_zip_path, final_webroot_path)
        return final_webroot_path

    except Exception as e:
        LOGGER.error("get_zip_file: %s, %s, %s, %s" %
                     (zip_file_url, main_file, destpath, e))
        return None
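add_body_margin_top is referenced above but not shown. A minimal sketch that injects an inline margin-top style into the <body> tag of one HTML file; the BeautifulSoup approach and the helper's signature are assumptions about the real implementation:

import os
from bs4 import BeautifulSoup

def add_body_margin_top(containing_dir, filename, margin='44px'):
    # Parse the page, set margin-top on <body>, and write it back out.
    file_path = os.path.join(containing_dir, filename)
    with open(file_path, 'rb') as f:
        page = BeautifulSoup(f.read(), 'html.parser')
    if page.body:
        existing = page.body.get('style', '').rstrip(';')
        prefix = existing + '; ' if existing else ''
        page.body['style'] = prefix + 'margin-top:%s;' % margin
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(str(page))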
Example #22
def scrape_content(title, content_url):
    """
    title: Boys' clothing
    content_url: http://www.touchableearth.org/china-culture-boys-clothing/
    """
    print("    Scraping content node: %s (%s)" % (title, content_url))

    doc = get_parsed_html_from_url(content_url)
    if not doc:  # 404
        return None

    description = create_description(doc)
    source_id = doc.select_one(".current_post.active .post_id")["value"]

    base_node_attributes = {
        "source_id": source_id,
        "title": title,
        "license": TE_LICENSE,
        "description": description,
    }

    youtube_iframe = doc.select_one(".video-container iframe")
    if youtube_iframe:
        youtube_url = youtube_iframe["src"]
        youtube_id = get_youtube_id_from_url(youtube_url)

        if not youtube_id:
            print("    *** WARNING: youtube_id not found for content url",
                  content_url)
            print("    Skipping.")
            return None

        try:
            info = ydl.extract_info(youtube_url, download=False)
            subtitles = info.get("subtitles")
            subtitle_languages = subtitles.keys() if subtitles else []
            print("      ... with subtitles in languages:", subtitle_languages)
        except youtube_dl.DownloadError as e:
            # Some of the videos have been removed from the YouTube channel --
            # skip creating content nodes for them entirely so they don't show up
            # as non-loadable videos in Kolibri.
            print("        NOTE: Skipping video download due to error: ", e)
            return None

        video_node = nodes.VideoNode(
            **base_node_attributes,
            derive_thumbnail=True,
            files=[WatermarkedYouTubeVideoFile(youtube_id=youtube_id)],
        )

        # Add subtitles in whichever languages are available.
        for language in subtitle_languages:
            video_node.add_file(
                files.YouTubeSubtitleFile(youtube_id=youtube_id,
                                          language=language))

        return video_node

    img = doc.select_one(".uncode-single-media-wrapper img")
    if img:
        img_src = img["data-guid"] or img["src"]
        destination = tempfile.mkdtemp()
        download_file(img_src,
                      destination,
                      request_fn=make_request,
                      filename="image.jpg")

        with open(os.path.join(destination, "index.html"), "w") as f:
            f.write("""
                <!doctype html>
                <html>
                <head></head>
                <body>
                    <img src="image.jpg" style="width: 100%; max-width: 1200px;" />
                </body>
                </html>
            """)

        zip_path = create_predictable_zip(destination)

        return nodes.HTML5AppNode(
            **base_node_attributes,
            files=[files.HTMLZipFile(zip_path)],
            thumbnail=img_src,
        )

    return None
Example #23
def overlay_and_watermark_video(filename, youtube_id):
    # Check if we've processed this file before -- is it in the cache?
    key = files.generate_key("WATERMARKED",
                             filename,
                             settings=WATERMARK_SETTINGS)
    if not config.UPDATE and files.FILECACHE.get(key):
        return files.FILECACHE.get(key).decode('utf-8')

    # Create a temporary filename to write the watermarked video.
    tempf = tempfile.NamedTemporaryFile(suffix=".{}".format(file_formats.MP4),
                                        delete=False)
    tempf.close()
    tempfile_name = tempf.name

    # Now watermark it with the Touchable Earth logo!
    print("\t--- Watermarking and adding overlay ", filename)

    # First add the overlay image -- this is the image shown as the first frame
    # so that when the video hasn't been played yet, it will show this image
    # rather than a black screen (since Touchable Earth's videos start from
    # a blank black screen).

    # Download the overlay image based on the YouTube ID
    overlay_src = 'https://i.ytimg.com/vi_webp/%s/maxresdefault.webp' % youtube_id
    print("\t    ... grabbing overlay image from %s" % overlay_src)
    destination = tempfile.mkdtemp()
    overlay_filename = "overlay.webp"
    overlay_file = os.path.join(destination, overlay_filename)
    _, response = download_file(overlay_src,
                                destination,
                                request_fn=sess.get,
                                filename=overlay_filename)

    video_clip = mpe.VideoFileClip(config.get_storage_path(filename),
                                   audio=True)

    if response.status_code == 200:
        overlay_clip = mpe.ImageClip(overlay_file).set_duration(0.1)
        concat_clips = mpe.concatenate_videoclips([overlay_clip, video_clip])
    else:
        concat_clips = video_clip
        print("\t    WARNING: Could not download overlay image file from %s" %
              overlay_src)

    # Now create the watermark logo as a clip ...
    logo = (mpe.ImageClip(WATERMARK_SETTINGS["image"]).set_duration(
        concat_clips.duration).resize(
            height=WATERMARK_SETTINGS["height"]).margin(
                right=WATERMARK_SETTINGS["right"],
                bottom=WATERMARK_SETTINGS["bottom"],
                opacity=0).set_pos(WATERMARK_SETTINGS["position"]))

    # And then combine it with the video clip.
    composite = mpe.CompositeVideoClip([concat_clips, logo])
    composite.duration = concat_clips.duration
    composite.write_videofile(tempfile_name, threads=4)

    # Now move the watermarked file to Ricecooker storage and hash its name
    # so it can be validated.
    watermarked_filename = "{}.{}".format(files.get_hash(tempfile_name),
                                          file_formats.MP4)
    files.copy_file_to_storage(watermarked_filename, tempfile_name)
    os.unlink(tempfile_name)
    os.unlink(overlay_file)

    files.FILECACHE.set(key, bytes(watermarked_filename, "utf-8"))
    return watermarked_filename
Example #24
    def download_sim(self, topic, sim, keywords, language):
        """
        Download, zip, and add a node for a sim, as well as any associated video.
        """

        localized_sim = sim["localizedSimulations"][0]

        print("\tProcessing sim:", localized_sim["title"])

        dst = tempfile.mkdtemp()
        download_file(
            localized_sim["downloadUrl"],
            dst,
            filename="index.html",
            request_fn=sess.get,
            middleware_callbacks=[process_sim_html],
        )

        zippath = create_predictable_zip(dst)

        authors = re.sub(r" \(.*?\)", "", sim["credits"]["designTeam"])
        authors = re.sub(r"<br\/?>", ", ", authors)

        title = localized_sim["title"]
        if language == "ar":
            if title in ARABIC_NAME_CATEGORY:
                title = ARABIC_NAME_CATEGORY[title]
            if title in SIM_TYPO:
                title = SIM_TYPO[title]

        # create a node for the sim
        simnode = HTML5AppNode(
            source_id="sim-%d" % localized_sim["id"],
            files=[HTMLZipFile(zippath)],
            title=title,
            description=sim["description"][language][:200],
            license=CC_BYLicense(
                "PhET Interactive Simulations, University of Colorado Boulder"
            ),
            # author=authors,
            # tags=[keywords[topic] for topic in sim["topicIds"]],
            thumbnail=sim["media"]["thumbnailUrl"],
            language=getlang(language),
        )

        # if there's a video, extract it and put it in the topic right before the sim
        videos = sim["media"]["vimeoFiles"]
        if videos:
            video_url = [v for v in videos
                         if v.get("height") == 540][0]["link"]

            videonode = VideoNode(
                source_id="video-%d" % localized_sim["id"],
                files=[VideoFile(video_url)],
                title="Video: %s" % localized_sim["title"],
                license=CC_BYLicense(
                    "PhET Interactive Simulations, University of Colorado Boulder"
                ),
                thumbnail=sim["media"]["thumbnailUrl"],
            )

            topic.add_child(videonode)

        # add the sim node into the topic
        topic.add_child(simnode)
Example #25
def download_content_node(category_node,
                          url,
                          title,
                          thumbnail=None,
                          description=None):
    doc = get_parsed_html_from_url(url)

    destination = tempfile.mkdtemp()
    doc = download_static_assets(doc,
                                 destination,
                                 'https://k12.thoughtfullearning.com',
                                 request_fn=make_request,
                                 url_blacklist=url_blacklist)

    remove_node(doc, '#header')
    remove_node(doc, '.subMenuBarContainer')
    remove_node(doc, '.breadbookmarkcontainer')
    remove_node(doc, '.resourcePageTypeTitle')
    remove_node(doc, '.sharethis-wrapper')
    remove_node(doc, '.ccBlock')
    remove_node(doc, '#block-views-resource-info-block-block-1')
    remove_node(doc, '#block-views-resource-info-block-block')
    remove_node(doc, '.productSuggestionContainer')
    remove_node(doc, 'footer')

    # For minilessons
    remove_node(doc, '.field-name-field-minilesson-downloadables')

    # For writing assessments
    remove_node(doc, '.assessmentTGLink')
    remove_node(doc, '.assessmentModelRubrics')
    remove_node(doc, '.view-display-id-attachment_1')

    # Write out the HTML source.
    with open(os.path.join(destination, "index.html"), "w") as f:
        f.write(str(doc))

    print("    ... downloaded to %s" % destination)
    #preview_in_browser(destination)

    thumbnail_path = None
    if thumbnail:
        # Manually download the thumbnail and use it so we can lowercase the
        # extension to be accepted by Ricecooker.
        thumbnail_filename = derive_filename(thumbnail)
        thumbnail_path = os.path.join(destination, thumbnail_filename)
        download_file(thumbnail,
                      destination,
                      request_fn=make_request,
                      filename=thumbnail_filename)

    # If there is an embedded video in the page source grab it as a video node.
    video_node = None
    iframe = doc.select_one('.embedded-video iframe')
    if iframe:
        youtube_url = iframe['src']
        youtube_id = get_youtube_id_from_url(youtube_url)
        info = ydl.extract_info(youtube_url, download=False)
        video_title = info['title']
        print(
            "    ... and with video titled %s from www.youtube.com/watch?v=%s"
            % (video_title, youtube_id))
        video_node = nodes.VideoNode(
            source_id=youtube_id,
            title=truncate_metadata(info['title']),
            license=licenses.CC_BY_NC_SALicense(
                copyright_holder=truncate_metadata('Thoughtful Learning')),
            description=info['description'],
            language="en",
            derive_thumbnail=True,
            files=[files.YouTubeVideoFile(youtube_id)],
        )
        category_node.add_child(video_node)

    zip_path = create_predictable_zip(destination)
    app_node = nodes.HTML5AppNode(
        source_id=url,
        title=truncate_metadata(title),
        license=licenses.CC_BY_NC_SALicense(
            copyright_holder=truncate_metadata('Thoughtful Learning')),
        description=description,
        thumbnail=thumbnail_path,
        files=[files.HTMLZipFile(zip_path)],
        language="en",
    )

    category_node.add_child(app_node)