def scrape_resource(url, topic):
    resource = BeautifulSoup(downloader.read(url), 'html5lib')
    LOGGER.info('      {}'.format(resource.find('h2').text))

    filepath = download_resource(resource.find('div', {'class': 'decargas'}).find('a')['href'])
    license = None
    author = ''
    for data_section in resource.find('div', {'class': 'datos_generales'}).find_all('h4'):
        if 'Licencia' in data_section.text:
            try:
                license = LICENSE_MAP[data_section.find_next_sibling('p').text](copyright_holder="Ceibal")
            except KeyError as e:
                LOGGER.error(str(e))
                license = licenses.CC_BYLicense(copyright_holder="Ceibal")
        elif 'Autor' in data_section.text:
            author = data_section.find_next_sibling('p').text
    if filepath:
        thumbnail = resource.find('div', {'class': 'img-recurso'}).find('img')['src']
        if thumbnail.endswith('.gif'):
            # Save the GIF locally under a .png filename so Ricecooker will accept it
            gif_url = thumbnail
            thumbnail = os.path.sep.join([DOWNLOAD_DIRECTORY, gif_url.split('/')[-1].replace('.gif', '.png')])
            with open(thumbnail, 'wb') as fobj:
                fobj.write(downloader.read(gif_url))

        topic.add_child(nodes.HTML5AppNode(
            title=resource.find('h2').text,
            source_id=url,
            license=license,
            author=author,
            description=resource.find('form').find_all('p')[1].text,
            thumbnail=thumbnail,
            tags=[tag.text[:30] for tag in resource.find_all('a', {'class': 'tags'})],
            files=[files.HTMLZipFile(path=filepath)],
        ))
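# LICENSE_MAP is referenced above but not included in this example. A minimal
# sketch of its likely shape: a mapping from the license labels that appear on
# the scraped page to ricecooker license classes. The keys below are
# illustrative assumptions, not values taken from the source.
LICENSE_MAP = {
    'CC BY': licenses.CC_BYLicense,
    'CC BY-SA': licenses.CC_BY_SALicense,
    'CC BY-NC': licenses.CC_BY_NCLicense,
    'CC BY-NC-SA': licenses.CC_BY_NC_SALicense,
}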
Example #2
    def create_dependency_zip(self):
        """
        Create a zip of the shared assets that are referenced by the other zip files.
        """
        pie_subdir = "PIE"
        dep_zip_temp_dir = os.path.join(self.temp_dir, 'dep_zip')
        os.makedirs(dep_zip_temp_dir)

        # Copy over the assets directory
        assets_temp_dir = os.path.join(dep_zip_temp_dir, self.assets_path_rel)
        shutil.copytree(self.assets_dir, assets_temp_dir)

        # Copy over the PIE shared libraries in the apps folder
        pie_dir = os.path.join(self.apps_path, pie_subdir)
        pie_temp_dir = os.path.join(dep_zip_temp_dir, pie_subdir)
        shutil.copytree(pie_dir, pie_temp_dir)
        for afile in os.listdir(pie_temp_dir):
            if 'three.js' in afile.lower() or 'three.min.js' in afile.lower():
                self.patch_three_js(os.path.join(pie_temp_dir, afile))
        # ricecooker requires all zips to have an index.html, even dependency zips right now.
        # FIXME: Have ricecooker check is_primary before alerting about missing index.html.
        index_file = os.path.join(dep_zip_temp_dir, 'index.html')
        with open(index_file, 'w') as f:
            f.write('')

        self.dep_zip = self.create_zip_from_dir(dep_zip_temp_dir)
        self.dep_zip_file = files.HTMLZipFile(
            self.dep_zip, preset=format_presets.HTML5_DEPENDENCY_ZIP)
Example #3
def download_wikipedia_page(url, thumbnail, title):
    """ Create zip file to use for html pages """
    # Create a temp directory to house our downloaded files
    destpath = tempfile.mkdtemp()

    # Download the main Wikipedia page, apply a middleware processor, and call it index.html
    localref, _ = download_file(
        url,
        destpath,
        filename="index.html",
        middleware_callbacks=process_wikipedia_page,
    )

    # Turn the temp folder into a zip file
    zippath = create_predictable_zip(destpath)

    # Create an HTML5 app node
    html5app = nodes.HTML5AppNode(
        files=[files.HTMLZipFile(zippath)],
        title=title,
        thumbnail=thumbnail,
        source_id=url.split("/")[-1],
        license=CHANNEL_LICENSE,
    )

    return html5app
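# The middleware callback referenced above is not included in this example.
# A minimal sketch, assuming download_file passes the raw page content plus
# context kwargs and uses the string the callback returns as index.html
# (make_fully_qualified_url and make_request are assumed helpers):
def process_wikipedia_page(content, baseurl, destpath, **kwargs):
    page = BeautifulSoup(content, "html.parser")
    for image in page.find_all("img"):
        # Download each image next to index.html and point the tag at the local copy
        relpath, _ = download_file(make_fully_qualified_url(image["src"]),
                                   destpath,
                                   request_fn=make_request)
        image["src"] = relpath
    return str(page)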
Example #4
def scrape_snack_subject(slug, topic):
    """ Scrape snack subject page
        Args:
            slug (str): url slug to scrape from (e.g. /subject/arts)
            topic (TopicNode): topic to add html nodes to
    """
    contents = BeautifulSoup(read(slug), 'html5lib')

    for activity in contents.find_all('div', {'class': 'activity'}):
        LOGGER.info("        {}".format(activity.find('h5').text.strip()))
        # Scrape snack pages into zips
        write_to_path, tags = scrape_snack_page(activity.find('a')['href'])
        if not write_to_path:
            continue

        # Create html node
        description = activity.find('div', {'class': 'pod-description'})
        topic.add_child(
            nodes.HTML5AppNode(
                source_id=activity.find('a')['href'],
                title=activity.find('h5').text.strip().replace("’", "'"),
                description=description.text.strip() if description else "",
                license=LICENSE,
                copyright_holder=COPYRIGHT_HOLDER,
                files=[files.HTMLZipFile(path=write_to_path)],
                thumbnail=get_thumbnail_url(activity.find('img')['src']),
                tags=tags,
            ))

    # Scrape next page (if any)
    next_page_url = get_next_page_url(contents)
    if next_page_url:
        scrape_snack_subject(next_page_url, topic)
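# get_next_page_url is not shown in this example. A plausible sketch, assuming
# the listing exposes a conventional "next" pagination link; the selector is an
# illustrative guess, not taken from the source:
def get_next_page_url(contents):
    next_link = contents.find('a', {'class': 'next'})
    return next_link['href'] if next_link else None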
Example #5
def add_files(node, file_list):
    for f in file_list:
        path = f.get('path')
        if path is not None:
            abspath = get_abspath(path)  # expand content:// --> ./content/ in file paths
        else:
            abspath = None

        file_type = guess_file_type(node.kind, filepath=abspath, youtube_id=f.get('youtube_id'), web_url=f.get('web_url'), encoding=f.get('encoding'))

        if file_type == FileTypes.AUDIO_FILE:
            node.add_file(files.AudioFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.THUMBNAIL:
            node.add_file(files.ThumbnailFile(path=abspath))
        elif file_type == FileTypes.DOCUMENT_FILE:
            node.add_file(files.DocumentFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.HTML_ZIP_FILE:
            node.add_file(files.HTMLZipFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.VIDEO_FILE:
            node.add_file(files.VideoFile(path=abspath, language=f.get('language'), ffmpeg_settings=f.get('ffmpeg_settings')))
        elif file_type == FileTypes.SUBTITLE_FILE:
            node.add_file(files.SubtitleFile(path=abspath, language=f['language']))
        elif file_type == FileTypes.BASE64_FILE:
            node.add_file(files.Base64ImageFile(encoding=f['encoding']))
        elif file_type == FileTypes.WEB_VIDEO_FILE:
            node.add_file(files.WebVideoFile(web_url=f['web_url'], high_resolution=f.get('high_resolution')))
        elif file_type == FileTypes.YOUTUBE_VIDEO_FILE:
            node.add_file(files.YouTubeVideoFile(youtube_id=f['youtube_id'], high_resolution=f.get('high_resolution')))
            node.add_file(files.YouTubeSubtitleFile(youtube_id=f['youtube_id'], language='en'))
        else:
            raise UnknownFileTypeError("Unrecognized file type '{0}'".format(f['path']))
Example #6
def download_puzzle(puzzle_url, title, description, thumbnail,
                    le_language_code, blockly_language_code):
    """Download a single puzzle and return an HTML5 app node."""
    with WebDriver("https://blockly-games.appspot.com/%s" % puzzle_url,
                   delay=1000) as driver:
        doc = BeautifulSoup(driver.page_source, "html.parser")

    # Create a temporary folder to download all the files for a puzzle.
    destination = tempfile.mkdtemp()

    # Download all the JS/CSS/images/audio/etc we can get from scraping the
    # page source.
    doc = download_static_assets(doc,
                                 destination,
                                 'https://blockly-games.appspot.com',
                                 request_fn=make_request,
                                 url_blacklist=['analytics.js'])

    # Download other files not picked up by the above generic assets fetching,
    # e.g. from GitHub.
    puzzle_name = puzzle_url.split('?')[0]
    download_additional_assets(destination, puzzle_name)

    # Make some modifications to the HTML source -- hide some elements.
    remove_node(doc, '#languageMenu')
    remove_node(doc, '#title')

    # Copy over some of our own JS/CSS files and then add links to them in the
    # page source.
    copy_tree("static", os.path.join(destination, "static"))

    chef_body_script = doc.new_tag("script", src="static/chef_end_of_body.js")
    doc.select_one('body').append(chef_body_script)

    chef_head_script = doc.new_tag("script")
    chef_head_script.string = 'window["BlocklyGamesLang"] = "%s";' % blockly_language_code
    doc.select_one('head').insert(0, chef_head_script)

    # Write out the HTML source.
    with open(os.path.join(destination, "index.html"), "w") as f:
        f.write(str(doc))

    print(
        "    Downloaded puzzle %s titled \"%s\" (thumbnail %s) to destination %s"
        % (puzzle_url, title, thumbnail, destination))
    # preview_in_browser(destination)

    zip_path = create_predictable_zip(destination)
    return nodes.HTML5AppNode(
        source_id=puzzle_url,
        title=truncate_metadata(title),
        description=description,
        license=licenses.PublicDomainLicense(copyright_holder='Google'),
        thumbnail=thumbnail,
        files=[files.HTMLZipFile(zip_path)],
        language=le_language_code,
    )
Example #7
def download_writing_topic_category(category_doc, title, level_id):
    destination = tempfile.mkdtemp()

    # Download a font
    font_url = make_fully_qualified_url(
        '//fonts.googleapis.com/css?family=Roboto:400,300,300italic,400italic,700,700italic'
    )
    download_file(font_url,
                  destination,
                  request_fn=make_request,
                  filename='roboto.css')

    # Write out the HTML source, based on CSS formatting from
    # https://k12.thoughtfullearning.com/resources/writingtopics

    topics = (("<li>%s</li>" % topic.text)
              for topic in category_doc.select('.views-row'))
    html_source = """
        <!DOCTYPE html>
        <head>
            <link href='roboto.css' rel='stylesheet' type='text/css'>
            <style>
                ul {
                    margin: 0 0 0 40px;
                    padding: 0;
                }
                li {
                    font-family: "Roboto", sans-serif;
                    font-weight: 300;
                    font-size: 19.2px;
                    line-height: 24.96px;
                    color: #202020;
                    margin-top: 10px;
                }
            </style>
        </head>
        <body>
            <ul>%s</ul>
        </body>
    """ % ''.join(topics)

    with open(os.path.join(destination, "index.html"), "w") as f:
        f.write(html_source)

    print("    ... downloaded to %s" % destination)
    #preview_in_browser(destination)

    zip_path = create_predictable_zip(destination)
    return nodes.HTML5AppNode(
        source_id="%s|%s" % (level_id, title),
        title=truncate_metadata(title),
        license=licenses.CC_BY_NC_SALicense(
            copyright_holder=truncate_metadata('Thoughtful Learning')),
        files=[files.HTMLZipFile(zip_path)],
        language="en",
        thumbnail=writing_topic_thumbnail,
    )
Example #8
    def construct_channel(self, *args, **kwargs):
        # Create ChannelNode from data in self.channel_info
        channel = self.get_channel(*args, **kwargs)

        lang_names = sorted(self.data.keys())

        for lang_name in lang_names:
            lang_data = self.data[lang_name]
            LOGGER.info("Creating app for language: {}".format(lang_name))
            lang = languages.getlang_by_native_name(lang_name)

            zip_dir = self.client.create_zip_dir_for_page(lang_data['url'])

            soup = self.client.get_page_soup(lang_data['url'])

            # Remove the translation list if found
            translations = soup.find('div', {'id': 'translations'})
            if translations:
                translations.extract()

            # Grab the localized title
            title = soup.find('span', {'id': 'share_title'}).text

            # Save the modified index.html page
            thumbnail = None
            for resource in lang_data['resources']:
                if 'dp3t.png' in resource:
                    thumbnail = os.path.join(zip_dir, resource)
                    break

            with open(os.path.join(zip_dir, 'index.html'), 'wb') as f:
                f.write(soup.prettify(encoding='utf-8'))

            # create_predictable_zip ensures that the ZIP file does not change each time it's created. This
            # ensures that the zip doesn't get re-uploaded just because zip metadata changed.
            zip_file = zip.create_predictable_zip(zip_dir)
            zip_name = lang.primary_code if lang else lang_name
            zip_filename = os.path.join(self.ZIP_DIR,
                                        "{}.zip".format(zip_name))
            os.makedirs(os.path.dirname(zip_filename), exist_ok=True)
            os.rename(zip_file, zip_filename)

            topic = nodes.TopicNode(source_id=lang_name, title=lang_name)
            zip_node = nodes.HTML5AppNode(
                source_id="covid19-sim-{}".format(lang_name),
                title=title,
                files=[files.HTMLZipFile(zip_filename)],
                license=licenses.PublicDomainLicense(
                    "Marcel Salathé & Nicky Case"),
                language=lang,
                thumbnail=thumbnail)
            topic.add_child(zip_node)
            channel.add_child(topic)

        return channel
Example #9
    def to_contentnode(self, title, directory=None, *args, **kwargs):
        # Generate a node based on the kind attribute
        filepath = self.to_file(directory=directory)
        if self.kind == content_kinds.HTML5:
            return nodes.HTML5AppNode(source_id=self.url,
                                      title=title,
                                      files=[files.HTMLZipFile(filepath)],
                                      **kwargs)
        elif self.kind == content_kinds.VIDEO:
            return nodes.VideoNode(source_id=self.url,
                                   title=title,
                                   files=[files.VideoFile(filepath)],
                                   **kwargs)
Example #10
    def create_topic_nodes_recursive(self, topic_info):
        """
        Create nodes for all the content items in the tree. Currently supports HTML5 app node and topic node creation.

        :param topic_info: Dictionary with information about the current topic to use for generating nodes.
        :return: A TopicNode of the node topic_info along with all child topics and nodes.
        """
        topic_node = nodes.TopicNode(source_id=str(topic_info['id']),
                                     title=topic_info['text'])

        has_content = False
        if 'nodes' in topic_info:
            has_content = True
            topic_nodes = topic_info['nodes']
            for anode in topic_nodes:
                node_files = [files.HTMLZipFile(anode['html5_zip'])]
                if 'needs_dep_zip' in anode and anode['needs_dep_zip']:
                    print("Needs dep zip: {}".format(anode))
                    node_files.append(self.dep_zip_file)
                html_node = nodes.HTML5AppNode(files=node_files,
                                               title=anode['title'],
                                               source_id=anode['dir'],
                                               license=licenses.CC_BY_NC,
                                               copyright_holder="ekShiksha")
                if 'description' in anode:
                    html_node.description = anode['description']

                # One possible way to store metadata about each content node on Studio.
                # extra_fields = {'metadata' : {}}
                # metadata = extra_fields['metadata']
                # metadata['grades'] = [{'curriculum': 'CBSE', 'grades': [int(anode['standard'])] }]
                # metadata['subject'] = topic_info['text']
                # TODO: Add the topic tree as 'categories'

                topic_node.add_child(html_node)

        if 'subtopics' in topic_info:
            for subtopic in topic_info['subtopics']:
                child = self.create_topic_nodes_recursive(subtopic)
                if child:
                    has_content = True
                    topic_node.add_child(child)

        # This shouldn't happen, so output a warning if it does.
        if not has_content:
            print("Node {} has no content".format(topic_info))
            return None

        return topic_node
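# An illustrative topic_info dict for create_topic_nodes_recursive. The keys
# match the lookups in the method; the values are invented for the sketch.
example_topic_info = {
    'id': 12,
    'text': 'Physics',
    'nodes': [
        {'title': 'Light', 'dir': 'physics/light',
         'html5_zip': 'physics/light.zip', 'needs_dep_zip': True,
         'description': 'Reflection and refraction'},
    ],
}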
Example #11
def create_html5_app_node(license,
                          content_dict,
                          ims_dir,
                          scraper_class=None,
                          temp_dir=None,
                          needs_scorm_support=False):
    if scraper_class:
        index_path = os.path.join(ims_dir, content_dict['index_file'])

        if '?' in index_path:
            index_path = index_path.split('?')[0]
        if '#' in index_path:
            index_path = index_path.split('#')[0]
        if content_dict['scormtype'] == 'sco' and needs_scorm_support:
            add_scorm_support(index_path, ims_dir)

        index_uri = pathlib.Path(os.path.abspath(index_path)).as_uri()
        zip_name = '%s.zip' % hashlib.md5(
            index_uri.encode('utf-8')).hexdigest()
        temp_dir = temp_dir if temp_dir else tempfile.gettempdir()
        zip_path = os.path.join(temp_dir, zip_name)
        scraper = scraper_class(index_uri)
        scraper.download_file(zip_path)
        logging.info('Webmixer scraper outputted HTML app to %s' % zip_path)

    else:
        with tempfile.TemporaryDirectory() as destination:
            index_src_path = os.path.join(ims_dir, content_dict['index_file'])
            index_dest_path = os.path.join(destination, 'index.html')
            shutil.copyfile(index_src_path, index_dest_path)

            for file_path in content_dict['files']:
                shutil.copy(os.path.join(ims_dir, file_path), destination)

            if content_dict.get('scormtype') == 'sco' and needs_scorm_support:
                add_scorm_support(index_dest_path, destination)

            #preview_in_browser(destination)
            zip_path = create_predictable_zip(destination)

    return nodes.HTML5AppNode(
        source_id=content_dict['identifier'],
        title=content_dict.get('title'),
        license=license,
        files=[files.HTMLZipFile(zip_path)],
    )
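# An illustrative content_dict for create_html5_app_node. The keys follow the
# lookups in the function; the values are invented for the sketch.
example_content_dict = {
    'identifier': 'RES-001',
    'title': 'Introduction',
    'index_file': 'res001/index.html',
    'scormtype': 'sco',
    'files': ['res001/styles.css', 'res001/main.js'],
}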
Example #12
def create_html5_app_node(license, content_dict):
    with tempfile.TemporaryDirectory() as destination:
        index_copy_path = os.path.join(destination, 'index.html')
        shutil.copyfile(content_dict['index_file'], index_copy_path)

        for file_path in content_dict['files']:
            shutil.copy(file_path, destination)

        #preview_in_browser(destination)

        zip_path = create_predictable_zip(destination)
        return nodes.HTML5AppNode(
            source_id=content_dict['identifier'],
            title=content_dict.get('title'),
            license=license,
            files=[files.HTMLZipFile(zip_path)],
        )
Example #13
    def download_content(self, parent, link, params, selected_category, start):
        """
        Parse each content page.
        """
        params["start"] = start
        params.pop("format")

        # Parse each page of the result
        resp = downloader.make_request("{}/itemlist/filter".format(link),
                                       params=params)
        soup = BeautifulSoup(resp.content, "html.parser")

        # Find the all the content in each page
        for item in soup.find("tbody").find_all("a"):
            content_url = "http://proyectodescartes.org{}".format(item["href"])
            title = item.text.strip()
            source_id = item["href"].split("/")[-1]

            # Parse each content's page
            response = downloader.make_request(content_url)
            page = BeautifulSoup(response.content, "html.parser")

            thumbnail_url = "http://proyectodescartes.org{}".format(
                page.find("div", class_="itemFullText").find("img")["src"])
            author = self.get_content_author(page)
            zip_path = self.get_content_zip(page)
            if not zip_path:
                LOGGER.info(
                    "The url for the zip file does not exist in this page: {}".
                    format(content_url))
                continue

            content_node = HTML5AppNode(
                source_id=source_id,
                title=title,
                license=CC_BY_NC_SALicense(
                    copyright_holder="Proyecto Descartes"),
                language=CHANNEL_LANGUAGE,
                files=[files.HTMLZipFile(zip_path)],
                author=author,
                thumbnail=thumbnail_url,
            )

            parent.add_child(content_node)
Example #14
def scrape_book(url, license):
    """ Scrape book and return html node
        e.g. https://saylordotorg.github.io/text_financial-accounting/
    """
    page = BeautifulSoup(read_source(url), 'html.parser')

    # Skip books that link to other websites
    if not page.find('div', {'id': 'book-content'}):
        return

    # Get fields for new html node
    title = page.find('h1').text.replace(u'\xa0', u' ').replace('\n', '')
    source_id = generate_id(title)
    write_to_path = "{}{}{}.zip".format(DOWNLOAD_DIRECTORY, os.path.sep,
                                        source_id)
    LOGGER.info("    " + title)

    # Write to html zip
    # if not os.path.isfile(write_to_path):
    with html.HTMLWriter(write_to_path) as zipper:
        # Parse table of contents
        contents = BeautifulSoup(read_source(url), 'html.parser')
        parse_page_links(url, contents, zipper)

        # Parse all links in the table of contents
        for link in contents.find_all('a'):
            if link.get('href'):
                # Get page content and write to zip
                chapter_contents = BeautifulSoup(
                    read_source(url, endpoint=link['href']), 'html.parser')
                parse_page_links(url, chapter_contents, zipper, link['href'])
                zipper.write_contents(link['href'],
                                      chapter_contents.prettify())

        # Write main index.html file and all shared files
        zipper.write_index_contents(contents.prettify())
        write_shared_library_to_zip(zipper)

    return nodes.HTML5AppNode(source_id=source_id,
                              title=title,
                              license=license,
                              copyright_holder=COPYRIGHT_HOLDER,
                              files=[files.HTMLZipFile(path=write_to_path)])
Example #15
def download_content_node(url, title):
    doc = get_parsed_html_from_url(url)

    destination = tempfile.mkdtemp()
    doc = download_static_assets(doc, destination,
            'http://migranthealth.eu/', request_fn=make_request,
            url_blacklist=url_blacklist, derive_filename=derive_filename)

    nodes_to_remove = [
        'header',
        '#page-top-header',
        '#block-region-side-pre',
        '#region-main .row-fluid .span4.heading-rts',
        '.readmoreLinks',
        '.courseSectionNext',
        'img[alt="next"]',
        '.modified',
        '.footer-rts',
        '#page-footer',
        '.back-to-top',
        '.skiplinks',
        '.linkicon',
        '.generalbox table tr:nth-of-type(2)',
    ]
    for selector in nodes_to_remove:
        for node in doc.select(selector):
            node.decompose()

    # Write out the HTML source.
    with open(os.path.join(destination, "index.html"), "w") as f:
        f.write(str(doc))

    print("        ... downloaded to %s" % destination)
    #preview_in_browser(destination)

    zip_path = create_predictable_zip(destination)
    return nodes.HTML5AppNode(
        source_id=url,
        title=truncate_metadata(title),
        license=MEET_LICENSE,
        files=[files.HTMLZipFile(zip_path)],
        language="en",
    )
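# derive_filename is passed to download_static_assets above but not defined in
# these examples. A plausible sketch, consistent with the later comment about
# lowercasing extensions so Ricecooker accepts the file; the hashing scheme is
# an assumption.
import hashlib
import os
import urllib.parse

def derive_filename(url):
    # Hash the full URL for a stable local name; keep the extension, lowercased
    path = urllib.parse.urlparse(url).path
    _, ext = os.path.splitext(os.path.basename(path))
    return hashlib.sha1(url.encode('utf-8')).hexdigest() + ext.lower()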
Example #16
def add_files(node, file_list):
    EXPECTED_FILE_TYPES = [
        VIDEO_FILE, AUDIO_FILE, DOCUMENT_FILE, HTML5_FILE, THUMBNAIL_FILE,
        SUBTITLES_FILE
    ]

    for f in file_list:
        file_type = f.get('file_type')
        if file_type not in EXPECTED_FILE_TYPES:
            LOGGER.critical(file_type)
            raise NotImplementedError(
                'Unexpected File type found in channel json.')

        path = f.get('path')  # path can be a URL or a local path (or None)

        # handle different types of files
        if file_type == VIDEO_FILE:
            # handle three types of video files
            if 'youtube_id' in f:
                video_file = files.YouTubeVideoFile(
                    youtube_id=f['youtube_id'],
                    download_settings=f.get('download_settings', None),
                    high_resolution=f.get('high_resolution', True),
                    maxheight=f.get('maxheight', None),
                    language=f.get('language', None),
                )
            elif 'web_url' in f:
                video_file = files.WebVideoFile(
                    web_url=f['web_url'],
                    download_settings=f.get('download_settings', None),
                    high_resolution=f.get('high_resolution', True),
                    maxheight=f.get('maxheight', None),
                    language=f.get('language', None),
                )
            else:
                video_file = files.VideoFile(
                    path=f['path'],
                    language=f.get('language', None),
                    ffmpeg_settings=f.get('ffmpeg_settings'),
                )
            node.add_file(video_file)

        elif file_type == AUDIO_FILE:
            node.add_file(
                files.AudioFile(path=f['path'],
                                language=f.get('language', None)))

        elif file_type == DOCUMENT_FILE:
            node.add_file(
                files.DocumentFile(path=path, language=f.get('language', None)))

        elif file_type == HTML5_FILE:
            node.add_file(
                files.HTMLZipFile(path=path, language=f.get('language', None)))

        elif file_type == THUMBNAIL_FILE:
            if 'encoding' in f:
                node.add_file(files.Base64ImageFile(encoding=f['encoding']))
            else:
                node.add_file(
                    files.ThumbnailFile(
                        path=path,
                        language=f.get('language', None),
                    ))

        elif file_type == SUBTITLES_FILE:
            if 'youtube_id' in f:
                node.add_file(
                    files.YouTubeSubtitleFile(youtube_id=f['youtube_id'],
                                              language=f['language']))
            else:
                node.add_file(
                    files.SubtitleFile(path=path, language=f['language']))

        else:
            raise UnknownFileTypeError("Unrecognized file type '{0}'".format(
                f['path']))
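# An illustrative file_list for this variant of add_files, where every entry
# carries an explicit file_type constant. The values are invented for the sketch.
example_file_list = [
    {'file_type': VIDEO_FILE, 'youtube_id': 'dQw4w9WgXcQ', 'high_resolution': False},
    {'file_type': DOCUMENT_FILE, 'path': 'content/lesson1.pdf', 'language': 'en'},
    {'file_type': HTML5_FILE, 'path': 'content/activity.zip', 'language': 'en'},
    {'file_type': SUBTITLES_FILE, 'path': 'content/lesson1.vtt', 'language': 'en'},
]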
Example #17
def scrape_content(title, content_url):
    """
    title: Boys' clothing
    content_url: http://www.touchableearth.org/china-culture-boys-clothing/
    """
    print("    Scraping content node: %s (%s)" % (title, content_url))

    doc = get_parsed_html_from_url(content_url)
    if not doc:  # 404
        return None

    description = create_description(doc)
    source_id = doc.select_one(".current_post.active .post_id")["value"]

    base_node_attributes = {
        "source_id": source_id,
        "title": title,
        "license": TE_LICENSE,
        "description": description,
    }

    youtube_iframe = doc.select_one(".video-container iframe")
    if youtube_iframe:
        youtube_url = youtube_iframe["src"]
        youtube_id = get_youtube_id_from_url(youtube_url)

        if not youtube_id:
            print("    *** WARNING: youtube_id not found for content url",
                  content_url)
            print("    Skipping.")
            return None

        try:
            info = ydl.extract_info(youtube_url, download=False)
            subtitles = info.get("subtitles")
            subtitle_languages = subtitles.keys() if subtitles else []
            print("      ... with subtitles in languages:", subtitle_languages)
        except youtube_dl.DownloadError as e:
            # Some of the videos have been removed from the YouTube channel --
            # skip creating content nodes for them entirely so they don't show up
            # as non-loadable videos in Kolibri.
            print("        NOTE: Skipping video download due to error: ", e)
            return None

        video_node = nodes.VideoNode(
            **base_node_attributes,
            derive_thumbnail=True,
            files=[WatermarkedYouTubeVideoFile(youtube_id=youtube_id)],
        )

        # Add subtitles in whichever languages are available.
        for language in subtitle_languages:
            video_node.add_file(
                files.YouTubeSubtitleFile(youtube_id=youtube_id,
                                          language=language))

        return video_node

    img = doc.select_one(".uncode-single-media-wrapper img")
    if img:
        img_src = img["data-guid"] or img["src"]
        destination = tempfile.mkdtemp()
        download_file(img_src,
                      destination,
                      request_fn=make_request,
                      filename="image.jpg")

        with open(os.path.join(destination, "index.html"), "w") as f:
            f.write("""
                <!doctype html>
                <html>
                <head></head>
                <body>
                    <img src="image.jpg" style="width: 100%; max-width: 1200px;" />
                </body>
                </html>
            """)

        zip_path = create_predictable_zip(destination)

        return nodes.HTML5AppNode(
            **base_node_attributes,
            files=[files.HTMLZipFile(zip_path)],
            thumbnail=img_src,
        )

    return None
Example #18
def process_node_from_doc(doc, book_id, title, thumbnail):
    """
    Create a Ricecooker HTML5AppNode instance given the HTML source and metadata.
    """
    if DOWNLOAD_ONE_TO_webroot:
        # Save the book's contents to the folder `webroot` in the chef root dir.
        # Use the script ./ricecooker/utils/kolibripreview.py to preview in Kolibri.
        destination = './webroot'
        if os.path.exists(destination):
            shutil.rmtree(destination)
        os.mkdir(destination)
    else:
        # Create a temporary folder to download all the files for a book
        destination = tempfile.mkdtemp()

    # Ensure the thumbnail is in a format Ricecooker can accept, and if not,
    # use the first slide as the thumbnail.
    thumbnail_extensions = ('jpg', 'jpeg', 'png')
    if not thumbnail.lower().endswith(thumbnail_extensions):
        print("Thumbnail src (%s) doesn't end in any of %s."
                " Will use the first slide as the source." % (
            thumbnail, thumbnail_extensions))
        first_slide_src = doc.select_one('#slide-container .slide img')['src']
        thumbnail = make_fully_qualified_url(first_slide_src)
        if not thumbnail.lower().endswith(thumbnail_extensions):
            thumbnail = None

    # Download all the JS/CSS/images/audio/etc. needed to make a standalone app
    doc = download_static_assets(doc, destination)

    # Remove a bunch of HTML that we don't want showing in our standalone app
    doc.select_one('base')['href'] = ''
    remove_node(doc, '#loading')
    remove_node(doc, '#finishedActions')
    remove_node(doc, '.bookmarkbtn')
    remove_node(doc, '.reader-expand')
    remove_node(doc, '#progressBar')
    remove_node(doc, '#androidNotification')
    remove_node(doc, '#exit')
    remove_node(doc, '#ttmenu')

    # Remove unnecessary scripts in the head
    for pat in tag_content_patterns_to_remove_in_head:
        remove_nodes_containing_pattern(doc, pat, parent_tag_name='head')
    for pat in tag_content_patterns_to_remove_in_body:
        remove_nodes_containing_pattern(doc, pat, parent_tag_name='body')
    for pat_start, pat_end in cut_start_end_patterns:
        remove_nodes_between_comments(doc, pat_start, pat_end, parent_tag_name='body')

    # Write out the HTML source
    with open(os.path.join(destination, "index.html"), "w") as f:
        f.write(str(doc))

    print("Downloaded book %s titled \"%s\" (thumbnail %s) to destination %s" % (
        book_id, title, thumbnail, destination))
    #preview_in_browser(destination)

    zip_path = create_predictable_zip(destination)
    return nodes.HTML5AppNode(
        source_id=book_id,
        title=truncate_metadata(title),
        license=licenses.CC_BY_NC_SALicense(copyright_holder='3asafeer.com'),
        thumbnail=thumbnail,
        files=[files.HTMLZipFile(zip_path)],
        language="ar",
    )
Example #19
def make_topic_tree_with_entrypoints(license,
                                     imscp_zip,
                                     imscp_dict,
                                     ims_dir,
                                     temp_dir=None,
                                     parent_id=None,
                                     node_options=None):
    """Return a TopicTree node from a dict of some subset of an IMSCP manifest.

    The actual IMSCP zip is marked as a dependency, and the zip loaded by Kolibri
    only contains an index.html file that redirects to the entrypoint defined in
    the manifest. This minimizes the additional content generated for Kolibri,
    and also allows us to support content where multiple content nodes have entrypoints
    defined by parameters, e.g. index.html#chapter2, index.html#chapter3, etc.

    Ready to be uploaded via Ricecooker to Studio or used in Kolibri.

    Args:
        license - License to apply to content nodes.
        imscp_zip (string) - Path of the IMSCP zip file, attached as a dependency zip.
        imscp_dict - Dict of IMSCP from extract_from_zip or extract_from_dir.
        ims_dir (string) - Path of directory of IMSCP
        temp_dir (string, optional) - Full path of temporary directory to
            output HTML zip files to.
        parent_id (string, optional) - Parent ID string to concatenate to source ID.
        node_options (dict, optional) - Options to pass to content renderer in Kolibri.
    """
    if not temp_dir:
        temp_dir = tempfile.gettempdir()

    source_id = imscp_dict['identifier']
    assert source_id, "{} has no identifier, parent id = {}".format(
        os.path.basename(imscp_zip), parent_id)
    if parent_id:
        source_id = '{}-{}'.format(parent_id, source_id)

    if imscp_dict.get('children'):
        topic_node = nodes.TopicNode(source_id=source_id,
                                     title=imscp_dict['title'])
        counter = 1
        for child in imscp_dict['children']:
            # We will get duplicate IDs if we don't have any ID set.
            if not child['identifier']:
                child['identifier'] = 'item{}'.format(counter)
            topic_node.add_child(
                make_topic_tree_with_entrypoints(license,
                                                 imscp_zip,
                                                 child,
                                                 ims_dir,
                                                 temp_dir=temp_dir,
                                                 parent_id=source_id,
                                                 node_options=node_options))
            counter += 1
        return topic_node
    else:
        if imscp_dict['type'] == 'webcontent':
            entrypoint_dir = os.path.join(temp_dir, 'entrypoint')
            if os.path.exists(entrypoint_dir):
                shutil.rmtree(entrypoint_dir)
            os.makedirs(entrypoint_dir)
            index = os.path.join(entrypoint_dir, "index.html")
            entrypoint_url = '/zipcontent/{}/{}'.format(
                os.path.basename(imscp_zip), imscp_dict['href'])
            f = open(index, "w", encoding="utf-8")
            f.write(ENTRYPOINT_TEMPLATE.format(entrypoint_url))
            f.close()

            zip_path = create_predictable_zip(entrypoint_dir)
            html5_node = nodes.HTML5AppNode(
                source_id=source_id,
                title=imscp_dict.get('title'),
                license=license,
                files=[
                    files.HTMLZipFile(zip_path),
                    files.HTMLZipFile(
                        imscp_zip, preset=format_presets.HTML5_DEPENDENCY_ZIP)
                ],
            )
            if node_options is not None:
                extra_data = {'options': node_options}

                html5_node.extra_fields.update(extra_data)

            return html5_node
        else:
            logging.warning('Content type %s not supported yet.' %
                            imscp_dict['type'])
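# ENTRYPOINT_TEMPLATE is used above but not included in this example. A minimal
# sketch of a redirect stub consistent with that usage (one format placeholder
# for the /zipcontent/ URL); the exact markup is an assumption.
ENTRYPOINT_TEMPLATE = """<!DOCTYPE html>
<html>
<head><meta http-equiv="refresh" content="0; url={}"></head>
<body></body>
</html>
"""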
Example #20
def download_content_node(category_node,
                          url,
                          title,
                          thumbnail=None,
                          description=None):
    doc = get_parsed_html_from_url(url)

    destination = tempfile.mkdtemp()
    doc = download_static_assets(doc,
                                 destination,
                                 'https://k12.thoughtfullearning.com',
                                 request_fn=make_request,
                                 url_blacklist=url_blacklist)

    remove_node(doc, '#header')
    remove_node(doc, '.subMenuBarContainer')
    remove_node(doc, '.breadbookmarkcontainer')
    remove_node(doc, '.resourcePageTypeTitle')
    remove_node(doc, '.sharethis-wrapper')
    remove_node(doc, '.ccBlock')
    remove_node(doc, '#block-views-resource-info-block-block-1')
    remove_node(doc, '#block-views-resource-info-block-block')
    remove_node(doc, '.productSuggestionContainer')
    remove_node(doc, 'footer')

    # For minilessons
    remove_node(doc, '.field-name-field-minilesson-downloadables')

    # For writing assessments
    remove_node(doc, '.assessmentTGLink')
    remove_node(doc, '.assessmentModelRubrics')
    remove_node(doc, '.view-display-id-attachment_1')

    # Write out the HTML source.
    with open(os.path.join(destination, "index.html"), "w") as f:
        f.write(str(doc))

    print("    ... downloaded to %s" % destination)
    #preview_in_browser(destination)

    thumbnail_path = None
    if thumbnail:
        # Manually download the thumbnail and use it so we can lowercase the
        # extension to be accepted by Ricecooker.
        thumbnail_filename = derive_filename(thumbnail)
        thumbnail_path = os.path.join(destination, thumbnail_filename)
        download_file(thumbnail,
                      destination,
                      request_fn=make_request,
                      filename=thumbnail_filename)

    # If there is an embedded video in the page source grab it as a video node.
    video_node = None
    iframe = doc.select_one('.embedded-video iframe')
    if iframe:
        youtube_url = iframe['src']
        youtube_id = get_youtube_id_from_url(youtube_url)
        info = ydl.extract_info(youtube_url, download=False)
        video_title = info['title']
        print(
            "    ... and with video titled %s from www.youtube.com/watch?v=%s"
            % (video_title, youtube_id))
        video_node = nodes.VideoNode(
            source_id=youtube_id,
            title=truncate_metadata(info['title']),
            license=licenses.CC_BY_NC_SALicense(
                copyright_holder=truncate_metadata('Thoughtful Learning')),
            description=info['description'],
            language="en",
            derive_thumbnail=True,
            files=[files.YouTubeVideoFile(youtube_id)],
        )
        category_node.add_child(video_node)

    zip_path = create_predictable_zip(destination)
    app_node = nodes.HTML5AppNode(
        source_id=url,
        title=truncate_metadata(title),
        license=licenses.CC_BY_NC_SALicense(
            copyright_holder=truncate_metadata('Thoughtful Learning')),
        description=description,
        thumbnail=thumbnail_path,
        files=[files.HTMLZipFile(zip_path)],
        language="en",
    )

    category_node.add_child(app_node)
Example #21
def scrape_page(exp_id, language, subject_node):
    # format to appropriate url depending on language
    my_downloader = downloader.ArchiveDownloader(EXPERIMENTS_FOLDER)
    url = format_url(exp_id, language)
    # page = downloader.archive_page(url, EXPERIMENTS_FOLDER)
    page = my_downloader.get_page(url, refresh=True)
    my_zip_dir = my_downloader.create_zip_dir_for_page(url)
    index_file = os.path.join(my_zip_dir, 'index.html')
    # entry = page['index_path']
    zip_path_entry = os.path.relpath(index_file,
                                     os.path.join('chefdata', 'experiments'))

    with open(index_file, encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    # get title
    visible_SRAtitle = soup.find('h1', {'class': 'SRAtitle'})
    title = visible_SRAtitle.get_text(strip=True)

    # get tags
    visible_SRAtd = soup.find_all('div', {'class': 'SRAtd'})
    visible_tags = visible_SRAtd[-1]
    tags_arr = []
    for a_tag in visible_tags.find_all('a'):
        tag = a_tag.get_text(strip=True)
        # remove special characters
        tag = re.sub(r"[^a-zA-Z0-9]+", ' ', tag)
        # remove trailing whitespace
        tag = tag.rstrip()
        tags_arr.append(tag)

    # remove navbar
    navbar = soup.find('nav')
    navbar.decompose()

    # remove footer
    footer = soup.find('footer')
    footer.decompose()

    # remove all hrefs
    for a_tag in soup.find_all('a'):
        del a_tag['href']
        # move all children of the a tag up to its parent
        a_tag.unwrap()

    # write the updated soup back to the html file
    soup_str = str(soup)
    with open(index_file, 'w', encoding='utf-8') as html_file:
        html_file.write(soup_str)

    # zippath = zip.create_predictable_zip(EXPERIMENTS_FOLDER, zip_path_entry)
    zippath = zip.create_predictable_zip(my_zip_dir)
    # copy zippath to temp folder here if necessary
    shutil.copy(zippath, TEMP_FOLDER)
    html5_node = nodes.HTML5AppNode(
        source_id='{0}_{1}'.format(language, url),
        files=[files.HTMLZipFile(zippath)],
        title=title,
        description='',
        license=licenses.CC_BYLicense('Sciensation'),
        language=language,
        thumbnail=None,
        author='Sciensation',
        tags=tags_arr)
    subject_node.add_child(html5_node)
    return subject_node