def download_wikipedia_page(url, thumbnail, title):
    """ Create zip file to use for html pages """
    # Create a temp directory to house our downloaded files
    destpath = tempfile.mkdtemp()

    # Download the main wikipedia page, apply a middleware processor, and call it index.html
    localref, _ = download_file(
        url,
        destpath,
        filename="index.html",
        middleware_callbacks=process_wikipedia_page,
    )

    # Turn the temp folder into a zip file
    zippath = create_predictable_zip(destpath)

    # Create an HTML5 app node
    html5app = nodes.HTML5AppNode(
        files=[files.HTMLZipFile(zippath)],
        title=title,
        thumbnail=thumbnail,
        source_id=url.split("/")[-1],
        license=CHANNEL_LICENSE,
    )

    return html5app
def download_wikipedia_page(url, thumbnail, title):
    # Create a temp directory to house our downloaded files
    destpath = tempfile.mkdtemp()

    # Download the main wikipedia page, apply a middleware processor, and call it index.html
    localref, _ = download_file(
        url,
        destpath,
        filename="index.html",
        middleware_callbacks=process_wikipedia_page,
        request_fn=make_request,
    )

    # Turn the temp folder into a zip file
    zippath = create_predictable_zip(destpath)

    # Create an HTML5 app node
    html5app = HTML5AppNode(
        files=[HTMLZipFile(zippath)],
        title=title,
        thumbnail=thumbnail,
        source_id=url.split("/")[-1],
        license=licenses.PublicDomainLicense(),
    )

    return html5app
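# A minimal sketch of what a `middleware_callbacks` processor such as
# `process_wikipedia_page` above might look like (the cleanup selectors below
# are illustrative assumptions, not the actual chef's rules). In ricecooker's
# `download_file`, a middleware callback receives the downloaded page content
# and returns the (possibly modified) content that gets written to disk.
from bs4 import BeautifulSoup

def process_wikipedia_page(content, destpath=None, **kwargs):
    page = BeautifulSoup(content, "html.parser")
    # Strip page chrome that makes no sense in an offline zip (hypothetical selectors).
    for selector in ["#mw-navigation", "#footer", ".mw-editsection"]:
        for node in page.select(selector):
            node.decompose()
    return str(page)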
def transform_html(content):
    """
    Transform the HTML markup taken from `content` (str) to file index.html in
    a standalone zip file. Return the necessary metadata as a dict.
    """
    chef_tmp_dir = 'chefdata/tmp'
    webroot = tempfile.mkdtemp(dir=chef_tmp_dir)

    metadata = dict(
        kind='html_content',
        source_id=content[0:30],
        zippath=None,  # to be set below
    )

    doc = BeautifulSoup(content, 'html5lib')
    meta = Tag(name='meta', attrs={'charset': 'utf-8'})
    doc.head.append(meta)
    # TODO: add meta language (in case of right-to-left languages)

    # Write out new index.html
    indexhtmlpath = os.path.join(webroot, 'index.html')
    with open(indexhtmlpath, 'w') as indexfilewrite:
        indexfilewrite.write(str(doc))

    # Zip it
    localize_image_refs(webroot)
    zippath = create_predictable_zip(webroot)
    metadata['zippath'] = zippath
    return metadata
def transform_hpstoryline_folder(contentdir, story_id, node):
    """
    Package the contents of the folder of kind `hpstoryline` called `story_id`
    located in the directory `contentdir` and return the necessary metadata as a dict.
    """
    sourcedir = os.path.join(contentdir, story_id)
    webroot = os.path.join(contentdir, story_id + '_webroot')  # transformed dir
    if not os.path.exists(sourcedir):
        print('WWW Could not find local resource folder for story_id=', story_id)
        return None
    if os.path.exists(webroot):
        shutil.rmtree(webroot)

    # Copy source dir to webroot dir where we'll do the edits and transformations
    shutil.copytree(sourcedir, webroot)

    metadata = dict(
        kind='hpstoryline',
        title_en=node['title'],
        source_id=story_id,
        thumbnail=None,  # TODO
        zippath=None,  # will be set below
    )

    # Zip it
    localize_image_refs(webroot)
    zippath = create_predictable_zip(webroot)
    metadata['zippath'] = zippath
    return metadata
def download_puzzle(puzzle_url, title, description, thumbnail, le_language_code,
                    blockly_language_code):
    """Download a single puzzle and return an HTML5 app node."""
    with WebDriver("https://blockly-games.appspot.com/%s" % puzzle_url, delay=1000) as driver:
        doc = BeautifulSoup(driver.page_source, "html.parser")

        # Create a temporary folder to download all the files for a puzzle.
        destination = tempfile.mkdtemp()

        # Download all the JS/CSS/images/audio/etc we can get from scraping the
        # page source.
        doc = download_static_assets(doc, destination,
                                     'https://blockly-games.appspot.com',
                                     request_fn=make_request,
                                     url_blacklist=['analytics.js'])

        # Download other files not picked up by the above generic assets fetching,
        # e.g. from GitHub.
        puzzle_name = puzzle_url.split('?')[0]
        download_additional_assets(destination, puzzle_name)

        # Make some modifications to the HTML source -- hide some elements.
        remove_node(doc, '#languageMenu')
        remove_node(doc, '#title')

        # Copy over some of our own JS/CSS files and then add links to them in the
        # page source.
        copy_tree("static", os.path.join(destination, "static"))

        chef_body_script = doc.new_tag("script", src="static/chef_end_of_body.js")
        doc.select_one('body').append(chef_body_script)

        chef_head_script = doc.new_tag("script")
        chef_head_script.string = 'window["BlocklyGamesLang"] = "%s";' % blockly_language_code
        doc.select_one('head').insert(0, chef_head_script)

        # Write out the HTML source.
        with open(os.path.join(destination, "index.html"), "w") as f:
            f.write(str(doc))

        print("    Downloaded puzzle %s titled \"%s\" (thumbnail %s) to destination %s" % (
            puzzle_url, title, thumbnail, destination))
        # preview_in_browser(destination)

        zip_path = create_predictable_zip(destination)
        return nodes.HTML5AppNode(
            source_id=puzzle_url,
            title=truncate_metadata(title),
            description=description,
            license=licenses.PublicDomainLicense(copyright_holder='Google'),
            thumbnail=thumbnail,
            files=[files.HTMLZipFile(zip_path)],
            language=le_language_code,
        )
def download_writing_topic_category(category_doc, title, level_id):
    destination = tempfile.mkdtemp()

    # Download a font
    font_url = make_fully_qualified_url(
        '//fonts.googleapis.com/css?family=Roboto:400,300,300italic,400italic,700,700italic')
    download_file(font_url, destination, request_fn=make_request, filename='roboto.css')

    # Write out the HTML source, based on CSS formatting from
    # https://k12.thoughtfullearning.com/resources/writingtopics
    topics = (("<li>%s</li>" % topic.text)
              for topic in category_doc.select('.views-row'))
    html_source = """
        <!DOCTYPE html>
        <head>
            <link href='roboto.css' rel='stylesheet' type='text/css'>
            <style>
                ul {
                    margin: 0 0 0 40px;
                    padding: 0;
                }
                li {
                    font-family: "Roboto", sans-serif;
                    font-weight: 300;
                    font-size: 19.2px;
                    line-height: 24.96px;
                    color: #202020;
                    margin-top: 10px;
                }
            </style>
        </head>
        <body>
            <ul>%s</ul>
        </body>
    """ % ''.join(topics)

    with open(os.path.join(destination, "index.html"), "w") as f:
        f.write(html_source)

    print("    ... downloaded to %s" % destination)
    # preview_in_browser(destination)

    zip_path = create_predictable_zip(destination)
    return nodes.HTML5AppNode(
        source_id="%s|%s" % (level_id, title),
        title=truncate_metadata(title),
        license=licenses.CC_BY_NC_SALicense(
            copyright_holder=truncate_metadata('Thoughtful Learning')),
        files=[files.HTMLZipFile(zip_path)],
        language="en",
        thumbnail=writing_topic_thumbnail,
    )
def construct_channel(self, *args, **kwargs):
    # Create ChannelNode from data in self.channel_info
    channel = self.get_channel(*args, **kwargs)

    lang_names = list(self.data.keys())
    lang_names.sort()

    for lang_name in lang_names:
        lang_data = self.data[lang_name]
        LOGGER.info("Creating app for language: {}".format(lang_name))
        lang = languages.getlang_by_native_name(lang_name)

        zip_dir = self.client.create_zip_dir_for_page(lang_data['url'])

        soup = self.client.get_page_soup(lang_data['url'])

        # Remove the translation list if found
        translations = soup.find('div', {'id': 'translations'})
        if translations:
            translations.extract()

        # Grab the localized title
        title = soup.find('span', {'id': 'share_title'}).text

        # Save the modified index.html page
        thumbnail = None
        for resource in lang_data['resources']:
            if 'dp3t.png' in resource:
                thumbnail = os.path.join(zip_dir, resource)
                break

        with open(os.path.join(zip_dir, 'index.html'), 'wb') as f:
            f.write(soup.prettify(encoding='utf-8'))

        # create_predictable_zip ensures that the ZIP file does not change each
        # time it's created. This ensures that the zip doesn't get re-uploaded
        # just because zip metadata changed.
        zip_file = zip.create_predictable_zip(zip_dir)
        zip_name = lang.primary_code if lang else lang_name
        zip_filename = os.path.join(self.ZIP_DIR, "{}.zip".format(zip_name))
        os.makedirs(os.path.dirname(zip_filename), exist_ok=True)
        os.rename(zip_file, zip_filename)

        topic = nodes.TopicNode(source_id=lang_name, title=lang_name)
        zip_node = nodes.HTML5AppNode(
            source_id="covid19-sim-{}".format(lang_name),
            title=title,
            files=[files.HTMLZipFile(zip_filename)],
            license=licenses.PublicDomainLicense("Marcel Salathé & Nicky Case"),
            language=lang,
            thumbnail=thumbnail,
        )
        topic.add_child(zip_node)
        channel.add_child(topic)

    return channel
def _scrape_story_html5(self, story):
    url = story['url']
    page = self._html.get(url)
    story_section = page.find('section', id='section-main')
    links_section = story_section.find('div', class_='languages-links')
    # Is there a way to cross-link HTML5AppNodes?
    if links_section:
        links_section.extract()
    title = self.__get_text(story_section.find('h1', class_='page-header'))
    language_code = self.__get_language_code(story['language'])
    dest_path = tempfile.mkdtemp(dir=NalibaliChef.ZIP_FILES_TMP_DIR)

    for img in story_section.find_all('img'):
        self._scrape_download_image(dest_path, img)

    basic_page_str = """
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="utf-8">
        <title></title>
    </head>
    <body>
    </body>
    </html>"""
    basic_page = BeautifulSoup(basic_page_str, "html.parser")
    body = basic_page.find('body')
    body.append(story_section)

    with open(os.path.join(dest_path, 'index.html'), 'w', encoding="utf8") as index_html:
        index_html.write(str(basic_page))

    zip_path = create_predictable_zip(dest_path)
    parsed_story_url = urlparse(url)
    return dict(
        kind=content_kinds.HTML5,
        source_id=parsed_story_url.path if parsed_story_url else url,
        title=title,
        language=language_code,
        description=story['description'],
        license=NalibaliChef.LICENSE,
        thumbnail=story['thumbnail'],
        files=[
            dict(
                file_type=content_kinds.HTML5,
                path=zip_path,
                language=language_code,
            )
        ],
    )
def create_zip_from_dir(self, dir_to_zip):
    """
    Adds all the files and subfolders from dir_to_zip into a Kolibri-compatible zip file.

    :param dir_to_zip: Directory containing files to zip.
    :return: Path to the zip file in the cache directory, named by its content hash.
    """
    temp_zip = zip.create_predictable_zip(dir_to_zip)
    zip_hash = files.get_hash(temp_zip)
    zip_dir = os.path.join(self.cache_dir, 'zips')
    if not os.path.exists(zip_dir):
        os.makedirs(zip_dir)
    output_zip = os.path.join(zip_dir, '{}.zip'.format(zip_hash))
    os.rename(temp_zip, output_zip)
    return output_zip
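# A minimal usage sketch for the cache-by-hash pattern above (the names
# `chef`, `webroot_dir`, and the node metadata are hypothetical): the returned
# path can be fed straight into an HTMLZipFile, and because the filename is
# the zip's content hash, an unchanged webroot reuses the same cached file
# across runs instead of being re-zipped and re-uploaded.
def make_cached_html5_node(chef, webroot_dir, source_id, title, license):
    output_zip = chef.create_zip_from_dir(webroot_dir)
    return nodes.HTML5AppNode(
        source_id=source_id,
        title=title,
        license=license,
        files=[files.HTMLZipFile(output_zip)],
    )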
def test_create_many_predictable_zip_files(ndirs=8193):
    """
    Regression test for `OSError: [Errno 24] Too many open files` when using
    the ricecooker.utils.zip.create_predictable_zip helper method:
    https://github.com/learningequality/ricecooker/issues/185
    Run `ulimit -a` to see the limits for # open files on your system and set
    ndirs to a higher number to use this test. Also comment out the @pytest.mark.skip
    """
    zip_paths = []
    for _ in range(0, ndirs):
        inputdir = tempfile.mkdtemp()
        with open(os.path.join(inputdir, 'index.html'), 'w') as testf:
            testf.write('something something')
        zip_path = create_predictable_zip(inputdir)
        zip_paths.append(zip_path)
    assert len(zip_paths) == ndirs, 'wrong number of zip files created'
def create_html5_app_node(license, content_dict, ims_dir, scraper_class=None,
                          temp_dir=None, needs_scorm_support=False):
    if scraper_class:
        index_path = os.path.join(ims_dir, content_dict['index_file'])
        if '?' in index_path:
            index_path = index_path.split('?')[0]
        if '#' in index_path:
            index_path = index_path.split('#')[0]

        if content_dict['scormtype'] == 'sco' and needs_scorm_support:
            add_scorm_support(index_path, ims_dir)

        index_uri = pathlib.Path(os.path.abspath(index_path)).as_uri()
        zip_name = '%s.zip' % hashlib.md5(index_uri.encode('utf-8')).hexdigest()
        temp_dir = temp_dir if temp_dir else tempfile.gettempdir()
        zip_path = os.path.join(temp_dir, zip_name)
        scraper = scraper_class(index_uri)
        scraper.download_file(zip_path)
        logging.info('Webmixer scraper output HTML app to %s' % zip_path)
    else:
        with tempfile.TemporaryDirectory() as destination:
            index_src_path = os.path.join(ims_dir, content_dict['index_file'])
            index_dest_path = os.path.join(destination, 'index.html')
            shutil.copyfile(index_src_path, index_dest_path)

            for file_path in content_dict['files']:
                shutil.copy(os.path.join(ims_dir, file_path), destination)

            if content_dict.get('scormtype') == 'sco' and needs_scorm_support:
                add_scorm_support(index_dest_path, destination)

            # preview_in_browser(destination)
            zip_path = create_predictable_zip(destination)

    return nodes.HTML5AppNode(
        source_id=content_dict['identifier'],
        title=content_dict.get('title'),
        license=license,
        files=[files.HTMLZipFile(zip_path)],
    )
def create_html5_app_node(license, content_dict):
    with tempfile.TemporaryDirectory() as destination:
        index_copy_path = os.path.join(destination, 'index.html')
        shutil.copyfile(content_dict['index_file'], index_copy_path)

        for file_path in content_dict['files']:
            shutil.copy(file_path, destination)

        # preview_in_browser(destination)
        zip_path = create_predictable_zip(destination)
        return nodes.HTML5AppNode(
            source_id=content_dict['identifier'],
            title=content_dict.get('title'),
            license=license,
            files=[files.HTMLZipFile(zip_path)],
        )
def get_phet_zip_file(zip_file_url, main_file_and_query):
    """
    PhET simulations are provided in the zip file `phet.zip`, and the entry
    point is passed as a GET parameter in `main_file_and_query`. To make these
    compatible with Kolibri's default behaviour of loading index.html, we will:
      - Rename index.html to phetindex.html
      - Add a custom index.html that uses a javascript redirect to phetindex.html?{sim_id}
    """
    u = urlparse(main_file_and_query)
    idk, sim_id = u.query.split('=')
    assert idk == 'id', 'unknown query string format found ' + main_file_and_query
    main_file = u.scheme + '://' + u.netloc + u.path  # skip querystring

    destpath = tempfile.mkdtemp()
    LOGGER.info('saving phet zip file in dir ' + destpath)
    try:
        download_file(zip_file_url, destpath, request_fn=make_request)

        zip_filename = zip_file_url.split('/')[-1]
        zip_basename = zip_filename.rsplit('.', 1)[0]
        zip_folder = os.path.join(destpath, zip_basename)

        # Extract zip file contents.
        local_zip_file = os.path.join(destpath, zip_filename)
        with zipfile.ZipFile(local_zip_file) as zf:
            zf.extractall(destpath)

        # Rename main_file to phetindex.html
        main_file = main_file.split('/')[-1]
        src = os.path.join(zip_folder, main_file)
        dest = os.path.join(zip_folder, 'phetindex.html')
        os.rename(src, dest)

        # Create the redirecting index.html
        index_html = PHET_INDEX_HTML_TEMPLATE.format(sim_id=sim_id)
        with open(os.path.join(zip_folder, 'index.html'), 'w') as indexf:
            indexf.write(index_html)

        # Always be zipping!
        return create_predictable_zip(zip_folder)

    except Exception as e:
        LOGGER.error("get_phet_zip_file: %s, %s, %s, %s" %
                     (zip_file_url, main_file_and_query, destpath, e))
        return None
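# `PHET_INDEX_HTML_TEMPLATE` is referenced above but not shown; a minimal
# sketch of what such a template could look like (an assumption, not the
# original constant) -- an index.html whose only job is to forward the sim id
# to the renamed entry point:
PHET_INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
<html>
<head><meta charset="utf-8"></head>
<body>
<script>
  // Redirect to the real entry point, preserving the sim id.
  window.location.href = 'phetindex.html?id={sim_id}';
</script>
</body>
</html>"""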
def download_content_node(url, title):
    doc = get_parsed_html_from_url(url)

    destination = tempfile.mkdtemp()
    doc = download_static_assets(doc, destination, 'http://migranthealth.eu/',
                                 request_fn=make_request,
                                 url_blacklist=url_blacklist,
                                 derive_filename=derive_filename)

    nodes_to_remove = [
        'header',
        '#page-top-header',
        '#block-region-side-pre',
        '#region-main .row-fluid .span4.heading-rts',
        '.readmoreLinks',
        '.courseSectionNext',
        'img[alt="next"]',
        '.modified',
        '.footer-rts',
        '#page-footer',
        '.back-to-top',
        '.skiplinks',
        '.linkicon',
        '.generalbox table tr:nth-of-type(2)',
    ]
    for selector in nodes_to_remove:
        for node in doc.select(selector):
            node.decompose()

    # Write out the HTML source.
    with open(os.path.join(destination, "index.html"), "w") as f:
        f.write(str(doc))

    print("    ... downloaded to %s" % destination)
    # preview_in_browser(destination)

    zip_path = create_predictable_zip(destination)
    return nodes.HTML5AppNode(
        source_id=url,
        title=truncate_metadata(title),
        license=MEET_LICENSE,
        files=[files.HTMLZipFile(zip_path)],
        language="en",
    )
def scrape_content_page(content_page_url, lang):
    """
    Download standalone HTML content pages (non-modules).
    Used for "Curriculum framework" and standalone pages in "Resources".
    Returns: page_info (dict): info necessary for constructing HTML5AppNode and HTMLZipFile
      - title
      - source_id
      - description
      - zip_path
    """
    LOGGER.debug('Scraping content page @ url = ' + str(content_page_url))
    doc = get_parsed_html_from_url(content_page_url)
    destination = tempfile.mkdtemp()
    print('destination=', destination)

    source_id = parse_qs(urlparse(content_page_url).query)['id'][0]
    raw_title = doc.select_one("head title").text
    content_title = raw_title.replace('OLCreate:', '')\
                             .replace('TESSA_ARABIC', '')\
                             .replace('TESSA_Eng', '')\
                             .replace('TESSA_Fr', '')\
                             .strip()
    page_info = dict(
        lang=lang,
        source_id=source_id,
        title=content_title,
        description=None,
        children=[],
    )

    # Do the actual download
    download_page(content_page_url, destination, 'index.html', lang)

    # Zip it
    page_info['zip_path'] = create_predictable_zip(destination)

    # Ship it
    return page_info
def download_wikipedia_page(url, title, writer, thumbnail=None):
    """ Create zip file to use for html pages """
    # Create a temp directory to house our downloaded files
    destpath = tempfile.mkdtemp()

    # Generate details for files
    details = {
        'thumbnail': thumbnail,
        'source_id': url.split("/")[-1],
        'license': CHANNEL_LICENSE,
    }

    # Download the main wikipedia page, apply middleware processor, and call it index.html
    localref, _ = download_file(url, destpath, filename="index.html",
                                middleware_callbacks=process_wikipedia_page)

    # Turn the temp folder into a zip file
    zippath = create_predictable_zip(destpath)

    writer.add_file(str(PATH), title, zippath, **details)
def package_html_content_as_html5_zip_file(html):
    """
    Transform the HTML markup in `html["content"]` (str) to file index.html in
    a standalone zip file. Return the path to the zip file.
    """
    chef_tmp_dir = 'chefdata/tmp'
    webroot = tempfile.mkdtemp(dir=chef_tmp_dir)

    content = html['content']
    doc = BeautifulSoup(content, 'html5lib')
    meta = Tag(name='meta', attrs={'charset': 'utf-8'})
    doc.head.append(meta)
    # TODO: add meta language (in case of right-to-left languages)

    # Write out new index.html
    indexhtmlpath = os.path.join(webroot, 'index.html')
    with open(indexhtmlpath, 'w') as indexfilewrite:
        indexfilewrite.write(str(doc))

    # Zip it
    zippath = create_predictable_zip(webroot)
    return zippath
def get_zip_file(zip_file_url, main_file):
    """HTML games are provided as zip files, and the entry point of the game is
    `main_file`. The `main_file` needs to be renamed to index.html to make it
    compatible with Kolibri.
    """
    destpath = tempfile.mkdtemp()
    try:
        download_file(zip_file_url, destpath, request_fn=make_request)

        zip_filename = zip_file_url.split('/')[-1]
        zip_basename = zip_filename.rsplit('.', 1)[0]
        zip_folder = os.path.join(destpath, zip_basename)

        # Extract zip file contents.
        local_zip_file = os.path.join(destpath, zip_filename)
        with zipfile.ZipFile(local_zip_file) as zf:
            zf.extractall(destpath)

        # In some cases, the files are under the www directory,
        # let's move them up one level.
        www_dir = os.path.join(zip_folder, 'www')
        if os.path.isdir(www_dir):
            files = os.listdir(www_dir)
            for f in files:
                shutil.move(os.path.join(www_dir, f), zip_folder)

        # Rename main_file to index.html.
        main_file = main_file.split('/')[-1]
        src = os.path.join(zip_folder, main_file)
        dest = os.path.join(zip_folder, 'index.html')
        os.rename(src, dest)

        return create_predictable_zip(zip_folder)

    except Exception as e:
        LOGGER.error("get_zip_file: %s, %s, %s, %s" %
                     (zip_file_url, main_file, destpath, e))
        return None
def get_content_zip(self, page):
    """
    Get the zip path of the content.
    """
    # Find the zip url of the content and check if it's valid.
    zip_href = page.find("a", href=re.compile(".zip"))
    if not zip_href:
        return None
    zip_url = "http://proyectodescartes.org{}".format(zip_href["href"])
    zip_resp = downloader.make_request(zip_url)
    if zip_resp.status_code != 200:
        return None

    filepath = "/tmp/{}".format(zip_url.split("/")[-1])
    with open(filepath, "wb") as f:
        f.write(zip_resp.content)

    dst = tempfile.mkdtemp()
    html_name = page.find(
        "div", class_="itemFullText").find("a")["href"].split("/")[-1]

    # Unzip the downloaded zip file and zip the folder again. If index.html
    # does not exist at the topmost level, rename the index page in the
    # folder to index.html before zipping the folder again.
    with zipfile.ZipFile(filepath) as zf:
        extracted_src = unquote(filepath.split("/")[-1].split(".zip")[0])
        zf.extractall(dst)
        if html_name != "index.html":
            src_index = os.path.join(dst, extracted_src, html_name)
            dst_index = src_index.replace(html_name, "index.html")
            if os.path.exists(src_index):
                os.rename(src_index, dst_index)
        zip_path = create_predictable_zip(os.path.join(dst, extracted_src))
    return zip_path
def get_zip_file(zip_file_url, main_file):
    """
    HTML games are provided as zip files, and the entry point of the game is
    `main_file`. The `main_file` needs to be renamed to index.html to make it
    compatible with Kolibri.
    """
    key = zip_file_url + main_file
    destpath = make_temporary_dir_from_key(key)

    # Check for "REPLACE WITH:" correction rule for the current `zip_file_url`
    replacement_url = should_replace_with(zip_file_url)
    if replacement_url:
        zip_file_url = replacement_url

    # return cached version if already there
    final_webroot_path = os.path.join(destpath, 'webroot.zip')
    if os.path.exists(final_webroot_path):
        return final_webroot_path

    try:
        download_file(zip_file_url, destpath, request_fn=make_request)

        zip_filename = zip_file_url.split('/')[-1]     # e.g. Mathematics.zip
        zip_basename = zip_filename.rsplit('.', 1)[0]  # e.g. Mathematics/

        # July 31: handle edge cases where zip filename doesn't match folder name inside it
        awazchitras = ['Awazchitra_HI', 'Awazchitra_TL', 'Awazchitra_KN',
                       'Awazchitra_BN', 'Awazchitra_OD', 'Awazchitra_PN',
                       'Awazchitra_TM']
        for awazchitra in awazchitras:
            if awazchitra in zip_basename:
                zip_basename = zip_basename.replace('Awazchitra', 'AwazChitra')
        if '_KKS_Hi' in zip_basename:
            zip_basename = zip_basename.replace('_KKS_Hi', '_KKS_HI')

        # Mar 2: more edge cases where zip filename doesn't match folder name inside it
        if 'Memorygamekb' in zip_basename:
            zip_basename = zip_basename.replace('Memorygamekb', 'MemoryGamekb')
        if 'cityofstories' in zip_basename:
            zip_basename = zip_basename.replace('cityofstories', 'CityOfStories')

        # Jun 12: fix more edge cases where .zip filename doesn't match dir name
        if '_KKS_Gj' in zip_basename:
            zip_basename = zip_basename.replace('_KKS_Gj', '_KKS_GJ')
        if 'ShabdKhel' in zip_basename:
            zip_basename = zip_basename.replace('ShabdKhel', 'Shabdkhel')

        zip_folder = os.path.join(destpath, zip_basename)  # e.g. destpath/Mathematics/
        main_file = main_file.split('/')[-1]  # e.g. activity_name.html or index.html
        if 'KhelbadiKahaniyan_MR' in zip_basename:
            # Inconsistency --- `main_file` contains dir name, and not index.html
            main_file = 'index.html'

        # Jul 8th: handle weird case-insensitive webserver main_file
        if main_file == 'mainexpand.html':
            main_file = 'mainExpand.html'  # <-- this is the actual filename in the zip

        # Zip files from the Pratham website have the web content inside a subfolder
        # of the same name as the zip filename. We need to recreate these zip files
        # to make sure the index.html is in the root of the zip.
        local_zip_file = os.path.join(destpath, zip_filename)
        with zipfile.ZipFile(local_zip_file) as zf:
            # If main_file is in the root (like zips from the game repository)
            # then we need to extract the zip contents to subfolder zip_basename/
            for zfileinfo in zf.filelist:
                if zfileinfo.filename == main_file:
                    destpath = os.path.join(destpath, zip_basename)
            # Extract zip so main file will be in destpath/zip_basename/index.html
            zf.extractall(destpath)

        # In some cases, the files are under the www directory,
        # let's move them up one level.
        www_dir = os.path.join(zip_folder, 'www')
        if os.path.isdir(www_dir):
            files = os.listdir(www_dir)
            for f in files:
                shutil.move(os.path.join(www_dir, f), zip_folder)

        # Rename `main_file` to index.html
        src = os.path.join(zip_folder, main_file)
        dest = os.path.join(zip_folder, 'index.html')
        os.rename(src, dest)

        # Logic to add margin-top:44px; for games that match Corrections tab
        add_margin_top = False
        for row in PRADIGI_CORRECTIONS_LIST:
            if row[CORRECTIONS_ACTION_KEY] == ADD_MARGIN_TOP_ACTION:
                pat = row[CORRECTIONS_SOURCE_URL_PAT_KEY]
                m = pat.match(zip_file_url)
                if m:
                    add_margin_top = True
        if add_margin_top:
            if zip_file_url.endswith('CourseContent/Games/Mathematics.zip'):
                LOGGER.info("adding body.margin-top:44px; to ALL .html files in: %s"
                            % zip_file_url)
                for root, dirs, files in os.walk(zip_folder):
                    for file in files:
                        if file.endswith(".html"):
                            add_body_margin_top(root, file)
            else:
                LOGGER.info("adding body.margin-top:44px; to index.html in: %s"
                            % zip_file_url)
                add_body_margin_top(zip_folder, 'index.html')

        # Replace occurrences of `main_file` with index.html to avoid broken links
        for root, dirs, files in os.walk(zip_folder):
            for file in files:
                if file.endswith(".html") or file.endswith(".js"):
                    file_path = os.path.join(root, file)
                    # use bytes to avoid Unicode errors "invalid start/continuation byte"
                    bytes_in = open(file_path, 'rb').read()
                    bytes_out = bytes_in.replace(main_file.encode('utf-8'), b'index.html')
                    open(file_path, 'wb').write(bytes_out)

        # Create the predictable zip file and copy it to the cached location
        tmp_predictable_zip_path = create_predictable_zip(zip_folder)
        shutil.copyfile(tmp_predictable_zip_path, final_webroot_path)
        return final_webroot_path

    except Exception as e:
        LOGGER.error("get_zip_file: %s, %s, %s, %s" %
                     (zip_file_url, main_file, destpath, e))
        return None
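# `add_body_margin_top` is used above but not shown; a plausible sketch of the
# helper (an assumption, not the original implementation): parse the HTML file
# in place and push the <body> down so it clears Kolibri's top bar.
import os
from bs4 import BeautifulSoup

def add_body_margin_top(containing_dir, filename, margin='44px'):
    file_path = os.path.join(containing_dir, filename)
    with open(file_path, 'rb') as inf:
        doc = BeautifulSoup(inf.read(), 'html5lib')
    body = doc.find('body')
    if body:
        # Append to any existing inline style rather than clobbering it.
        body['style'] = body.get('style', '') + 'margin-top:%s;' % margin
    with open(file_path, 'wb') as outf:
        outf.write(doc.encode('utf-8'))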
def download_content_node(category_node, url, title, thumbnail=None, description=None):
    doc = get_parsed_html_from_url(url)

    destination = tempfile.mkdtemp()
    doc = download_static_assets(doc, destination,
                                 'https://k12.thoughtfullearning.com',
                                 request_fn=make_request,
                                 url_blacklist=url_blacklist)

    remove_node(doc, '#header')
    remove_node(doc, '.subMenuBarContainer')
    remove_node(doc, '.breadbookmarkcontainer')
    remove_node(doc, '.resourcePageTypeTitle')
    remove_node(doc, '.sharethis-wrapper')
    remove_node(doc, '.ccBlock')
    remove_node(doc, '#block-views-resource-info-block-block-1')
    remove_node(doc, '#block-views-resource-info-block-block')
    remove_node(doc, '.productSuggestionContainer')
    remove_node(doc, 'footer')

    # For minilessons
    remove_node(doc, '.field-name-field-minilesson-downloadables')

    # For writing assessments
    remove_node(doc, '.assessmentTGLink')
    remove_node(doc, '.assessmentModelRubrics')
    remove_node(doc, '.view-display-id-attachment_1')

    # Write out the HTML source.
    with open(os.path.join(destination, "index.html"), "w") as f:
        f.write(str(doc))

    print("    ... downloaded to %s" % destination)
    # preview_in_browser(destination)

    thumbnail_path = None
    if thumbnail:
        # Manually download the thumbnail and use it so we can lowercase the
        # extension to be accepted by Ricecooker.
        thumbnail_filename = derive_filename(thumbnail)
        thumbnail_path = os.path.join(destination, thumbnail_filename)
        download_file(thumbnail, destination, request_fn=make_request,
                      filename=thumbnail_filename)

    # If there is an embedded video in the page source, grab it as a video node.
    video_node = None
    iframe = doc.select_one('.embedded-video iframe')
    if iframe:
        youtube_url = iframe['src']
        youtube_id = get_youtube_id_from_url(youtube_url)
        info = ydl.extract_info(youtube_url, download=False)
        video_title = info['title']
        print("    ... and with video titled %s from www.youtube.com/watch?v=%s" % (
            video_title, youtube_id))
        video_node = nodes.VideoNode(
            source_id=youtube_id,
            title=truncate_metadata(info['title']),
            license=licenses.CC_BY_NC_SALicense(
                copyright_holder=truncate_metadata('Thoughtful Learning')),
            description=info['description'],
            language="en",
            derive_thumbnail=True,
            files=[files.YouTubeVideoFile(youtube_id)],
        )
        category_node.add_child(video_node)

    zip_path = create_predictable_zip(destination)
    app_node = nodes.HTML5AppNode(
        source_id=url,
        title=truncate_metadata(title),
        license=licenses.CC_BY_NC_SALicense(
            copyright_holder=truncate_metadata('Thoughtful Learning')),
        description=description,
        thumbnail=thumbnail_path,
        files=[files.HTMLZipFile(zip_path)],
        language="en",
    )
    category_node.add_child(app_node)
def download_module(module_url, lang=None):
    LOGGER.debug('Scraping module @ url = ' + module_url)
    doc = get_parsed_html_from_url(module_url)

    source_id = parse_qs(urlparse(module_url).query)['id'][0]
    raw_title = doc.select_one("head title").text
    module_title = raw_title.replace('OLCreate:', '')\
                            .replace('TESSA_ARABIC', '')\
                            .replace('TESSA_Eng', '')\
                            .replace('TESSA_Fr', '')\
                            .strip()
    module_contents_dict = dict(
        kind='TessaModuleContentsDict',
        lang=lang,
        source_id=source_id,
        title=module_title,
        children=[],
    )

    # TRY TO CREATE MODULE TOC SIDEBAR MENU
    ############################################################################
    current_li_deep = doc.find('li', class_='oucontent-tree-current')

    # Sept 5th: special treatment for modules with no TOC in sidebar
    if current_li_deep is None:
        return download_module_no_toc(module_url, lang=lang)

    # CREATE MODULE TOC SIDEBAR MENU
    # July 28 HACK: infer module_toc_li using marker on sublist-li
    ############################################################################
    destination = tempfile.mkdtemp()
    print('destination=', destination)

    # copy css/js/images from skel
    shutil.copytree('chefdata/templates/module_skel/styles',
                    os.path.join(destination, 'styles'))

    is_first_section = True
    module_toc_li = current_li_deep.find_parent('li', class_='item-section')
    # print(module_toc_li.prettify())
    # module_contents_div = module_toc_li.find('div', class_='oucontent-contents')
    outer_module_ul = module_toc_li.find('ul', class_='child-item-list', recursive=False)
    inner_module_ul = outer_module_ul.find('div', class_='oucontent-contents')\
                                     .find('ul', recursive=False)
    section_lis = inner_module_ul.find_all('li', recursive=False)
    print(len(section_lis))

    # DETECT IF SIMPLE MODULE (single page, no sections) OR COMPLEX MODULE (with sections)
    if len(section_lis) == 0:
        print('UNEXPECTED -------- len(section_lis) == 0')
        print(module_url, '<<< <<< ' * 6)
    if len(section_lis) == 1:
        is_simple_module = True
    else:
        is_simple_module = False

    # SIMPLE MODULES THAT CONSIST OF A SINGLE PAGE -- becomes index.html
    if is_simple_module:
        section_li = section_lis[0]
        # print('*'*120)
        # print(section_li.prettify())
        section_title_span = section_li.find('span', class_='oucontent-tree-item')
        section_title = get_text(section_title_span)
        print('Processing simple module:', section_title)
        section_dict = dict(
            kind='TessaModuleContentsSection',
            title=section_title,
            href=module_url,
            filename='index.html',  # TODO: figure out if this is necessary
            children=[],
        )
        # print('   section:', section_title)
        module_contents_dict['children'].append(section_dict)

        subsections_ul = section_li.find('ul', recursive=False)
        if subsections_ul:
            pass  # print('found some subsections...')
        else:
            pass  # print('no subsections <ul> found in this section')

        download_page(module_url, destination, 'index.html', lang)
    # /SIMPLE MODULE

    # COMPLEX MODULES WITH SECTIONS AND custom-made TOC in index.html
    else:
        for section_li in section_lis:
            if 'download individual sections' in get_text(section_li):  # TODO: AR, SW, FR
                print('skipping section "Read or download individual sections..."')
                continue
            # print(section_li.prettify())
            # print('>'*80)
            section_title_span = section_li.find('span', class_='oucontent-tree-item')
            if section_title_span:
                if section_title_span.find('span', class_='current-title'):
                    section_href = module_url
                else:
                    section_a = section_title_span.find('a')
                    if section_a:
                        section_href = section_a['href']
                    else:
                        # for sections like "Top 20 ideas for teaching large classes"
                        section_href = '#NOLINK'
            else:
                # for sections like "Read or download individual sections of the m..."
                section_href = '#NOLINK'

            # special case for first section --- since it doesn't save section in filename
            # manually call download_page with filename section-1.html with contents of current page
            if is_first_section:
                section_filename = 'section-1.html'
                is_first_section = False
            else:
                if '#NOLINK' not in section_href:
                    section_filename = get_section_filename(section_href)

            # accesshide_span = section_title_span.find('span', class_='accesshide')
            # if accesshide_span:
            #     accesshide_span.extract()
            # subsections_ul.extract()
            section_title = get_text(section_title_span)
            section_dict = dict(
                kind='TessaModuleContentsSection',
                title=section_title,
                href=section_href,
                filename=section_filename,
                children=[],
            )
            # print('   section:', section_title)
            module_contents_dict['children'].append(section_dict)

            subsections_ul = section_li.find('ul', recursive=False)
            if subsections_ul:
                subsection_lis = subsections_ul.find_all('li')
                for subsection_li in subsection_lis:
                    # print('<'*100)
                    # print(subsection_li.prettify())
                    subsection_link = subsection_li.find('a')
                    if not subsection_link:  # handle weird <li>s with no link
                        LOGGER.warning('((((( Skipping section ' + subsection_li.get_text() +
                                       ' because no subsection_link')
                        continue
                    subsection_href = subsection_link['href']
                    subsection_filename = get_section_filename(subsection_href)
                    # subaccesshide_span = subsection_li.find('span', class_='accesshide')
                    # if subaccesshide_span:
                    #     subaccesshide_span.extract()
                    subsection_title = get_text(subsection_li)
                    subsection_dict = dict(
                        kind='TessaModuleContentsSubsection',
                        title=subsection_title,
                        href=subsection_href,
                        filename=subsection_filename,
                    )
                    # print('      subsection:', subsection_title)
                    section_dict['children'].append(subsection_dict)
            else:
                print('no subsections <ul> found in this section')

        module_index_tmpl = jinja2.Template(
            open('chefdata/templates/module_index.html').read())
        index_contents = module_index_tmpl.render(module=module_contents_dict)
        with open(os.path.join(destination, "index.html"), "w") as f:
            f.write(index_contents)

        # download the html content from each section/subsection
        for section in module_contents_dict['children']:
            if '#NOLINK' in section['href']:
                print('nothing to download for #NOLINK section')
                continue
            download_section(section['href'], destination, section['filename'], lang)
            for subsection in section['children']:
                if '#NOLINK' in subsection['href']:
                    print('nothing to download for #NOLINK subsection')
                    continue
                download_section(subsection['href'], destination,
                                 subsection['filename'], lang)
    # /COMPLEX MODULE

    zip_path = create_predictable_zip(destination)
    return zip_path
def download_module_no_toc(module_url, lang=None):
    """
    Extracting the module table of contents from the sidebar nav doesn't work
    for certain modules in FR, e.g.
    http://www.open.edu/openlearncreate/mod/oucontent/view.php?id=105334&section=1.1
    If NO TOC is available, then we'll crawl the pages one by one (building
    up `module_contents_dict` as we go).
    """
    LOGGER.debug('Scraping module @ url = ' + str(module_url))
    doc = get_parsed_html_from_url(module_url)
    destination = tempfile.mkdtemp()
    print('destination=', destination)

    # copy css/js/images from skel
    shutil.copytree('chefdata/templates/module_skel/styles',
                    os.path.join(destination, 'styles'))

    source_id = parse_qs(urlparse(module_url).query)['id'][0]
    raw_title = doc.select_one("head title").text
    module_title = raw_title.replace('OLCreate:', '')\
                            .replace('TESSA_ARABIC', '')\
                            .replace('TESSA_Eng', '')\
                            .replace('TESSA_Fr', '')\
                            .strip()
    module_contents_dict = dict(
        kind='TessaModuleContentsDict',
        source_id=source_id,
        title=module_title,
        lang=lang,
        children=[],
    )
    # print(module_contents_dict)

    # recursively download all sections by following "Next" links
    current_url = module_url
    current_section = None
    is_first_section = True
    while True:
        LOGGER.debug('processing current_url ' + str(current_url))
        current_doc = get_parsed_html_from_url(current_url)

        # special handling for module-level page (no section in url but is really Section 1)
        if is_first_section:
            section_filename = 'section-1.html'
            is_first_section = False
        else:
            section_filename = get_section_filename(current_url)

        # Do the actual download
        download_section(current_url, destination, section_filename, lang)

        # Store section/subsection info so we can build the TOC later
        doc = get_parsed_html_from_url(current_url)
        raw_title = doc.select_one("head title").text
        the_title = raw_title.replace('OLCreate:', '')\
                             .replace('TESSA_ARABIC', '')\
                             .replace('TESSA_Eng', '')\
                             .replace('TESSA_Fr', '')\
                             .strip()

        # sections e.g. section-3.html
        if '_' not in section_filename:
            section_dict = dict(
                kind='TessaModuleContentsSection',
                title=the_title,
                href=current_url,
                filename=section_filename,
                children=[],
            )
            module_contents_dict['children'].append(section_dict)
            print('   - section:', the_title[0:80])
            current_section = section_dict
        # subsections e.g. section-3_2.html
        else:
            subsection_title = the_title.replace(module_title, '')
            subsection_title = subsection_title.replace(current_section['title'], '')
            subsection_title = subsection_title.lstrip()
            if subsection_title.startswith(': '):
                subsection_title = subsection_title.replace(': ', '', 1)
            subsection_dict = dict(
                kind='TessaModuleContentsSubsection',
                title=subsection_title,
                href=current_url,
                filename=section_filename,
            )
            print('     - subsection:', subsection_title[0:80])
            current_section['children'].append(subsection_dict)

        # Recurse if next
        next_url = _get_next_section_url(current_doc)
        if next_url:
            current_url = next_url
        else:
            break

    # for debugging...
    # pp.pprint(module_contents_dict)

    module_index_tmpl = jinja2.Template(
        open('chefdata/templates/module_index.html').read())
    index_contents = module_index_tmpl.render(module=module_contents_dict)
    with open(os.path.join(destination, "index.html"), "w") as f:
        f.write(index_contents)

    # return module_contents_dict
    zip_path = create_predictable_zip(destination)
    return zip_path
def transform_articulate_storyline_folder(contentdir, activity_ref):
    """
    Transform the contents of the folder of kind `articulate_storyline` called
    `activity_ref` located in the directory `contentdir` to adapt it to the
    Kolibri platform, package it as a zip, and return the necessary metadata as a dict.
    """
    sourcedir = os.path.join(contentdir, activity_ref)             # source folder
    webroot = os.path.join(contentdir, activity_ref + '_webroot')  # transformed dir
    if not os.path.exists(sourcedir):
        print('WWW Could not find local resource folder for activity_ref=', activity_ref)
        return None
    if os.path.exists(webroot):
        shutil.rmtree(webroot)

    # Copy source dir to webroot dir where we'll do the edits and transformations
    shutil.copytree(sourcedir, webroot)

    # Remove unnecessary files
    html_files_to_remove = ['story.html', 'story.swf', 'story_flash.html']
    for html_file in html_files_to_remove:
        filepath = os.path.join(webroot, html_file)
        if os.path.exists(filepath):
            os.remove(filepath)

    # Remove all .swf files from webroot/
    for root, dirs, files in os.walk(webroot):
        for file in files:
            filepath = os.path.join(root, file)
            _, ext = os.path.splitext(filepath)
            if ext == '.swf':
                os.remove(filepath)

    metapath = os.path.join(webroot, 'meta.xml')
    metaxml = open(metapath, 'r').read()
    metadoc = BeautifulSoup(metaxml, "html5lib")
    project = metadoc.find('project')
    # TODO: get author from project > <author name="Victoria" email="" website="" />
    metadata = dict(
        kind='articulate_storyline',
        title_en=project['title'],
        source_id=activity_ref,
        thumbnail=os.path.join(webroot, project.attrs['thumburl']),
        datepublished=project['datepublished'],
        duration=project['duration'],
        totalaudio=project['totalaudio'],
        zippath=None,  # to be set below
    )

    # Set up index.html
    indexhtmlpath = os.path.join(webroot, 'index.html')
    shutil.move(os.path.join(webroot, 'story_html5.html'), indexhtmlpath)

    # load index.html
    with open(indexhtmlpath, 'r') as indexfileread:
        indexhtml = indexfileread.read()
    doc = BeautifulSoup(indexhtml, 'html5lib')

    # A. Localize js libs in <HEAD>
    scriptsdir = os.path.join(webroot, 'scripts')
    if not os.path.exists(scriptsdir):
        os.mkdir(scriptsdir)
    headscripts = doc.find('head').find_all('script')
    for script in headscripts:
        script_url = script['src']
        script_basename = os.path.basename(script_url)
        response = requests.get(script_url, verify=False)
        with open(os.path.join(scriptsdir, script_basename), 'wb') as scriptfile:
            scriptfile.write(response.content)
        scriptrelpath = os.path.join('scripts', script_basename)
        script['src'] = scriptrelpath

    # B. Inline css files to avoid CORS issues
    styles = doc.find('body').find_all('link', rel="stylesheet")
    for style in styles:
        style_href = style['href']
        style_path = os.path.join(webroot, style_href)
        if not os.path.exists(style_path) and 'min.css' in style_path:
            style_path = style_path.replace('min.css', 'css')
        style_content = '\n' + open(style_path).read()
        inline_style_tag = doc.new_tag('style')
        inline_style_tag['data-noprefix'] = ''
        inline_style_tag['rel'] = 'stylesheet'
        inline_style_tag.string = style_content
        style.replace_with(inline_style_tag)

    # C. Ensure that js files exist (rewrite app.min.js --> app.js if needed)
    bodyscripts = doc.find('body').find_all('script')
    for script in bodyscripts:
        if script.has_attr('src'):
            script_src = script['src']
            script_path = os.path.join(webroot, script_src)
            if not os.path.exists(script_path) and 'min.js' in script_path:
                new_script_path = script_src.replace('min.js', 'js')
                script['src'] = new_script_path
                print('  replaced script_src', script_src,
                      'with new_script_path', new_script_path)

    # Save modified index.html
    with open(indexhtmlpath, 'w') as indexfilewrite:
        indexfilewrite.write(str(doc))

    # Zip it
    localize_image_refs(webroot)
    zippath = create_predictable_zip(webroot)
    metadata['zippath'] = zippath
    return metadata
def scrape_content(title, content_url):
    """
    title: Boys' clothing
    content_url: http://www.touchableearth.org/china-culture-boys-clothing/
    """
    print("    Scraping content node: %s (%s)" % (title, content_url))

    doc = get_parsed_html_from_url(content_url)
    if not doc:  # 404
        return None

    description = create_description(doc)
    source_id = doc.select_one(".current_post.active .post_id")["value"]

    base_node_attributes = {
        "source_id": source_id,
        "title": title,
        "license": TE_LICENSE,
        "description": description,
    }

    youtube_iframe = doc.select_one(".video-container iframe")
    if youtube_iframe:
        youtube_url = doc.select_one(".video-container iframe")["src"]
        youtube_id = get_youtube_id_from_url(youtube_url)
        if not youtube_id:
            print("    *** WARNING: youtube_id not found for content url", content_url)
            print("    Skipping.")
            return None

        try:
            info = ydl.extract_info(youtube_url, download=False)
            subtitles = info.get("subtitles")
            subtitle_languages = subtitles.keys() if subtitles else []
            print("    ... with subtitles in languages:", subtitle_languages)
        except youtube_dl.DownloadError as e:
            # Some of the videos have been removed from the YouTube channel --
            # skip creating content nodes for them entirely so they don't show up
            # as non-loadable videos in Kolibri.
            print("    NOTE: Skipping video download due to error: ", e)
            return None

        video_node = nodes.VideoNode(
            **base_node_attributes,
            derive_thumbnail=True,
            files=[WatermarkedYouTubeVideoFile(youtube_id=youtube_id)],
        )

        # Add subtitles in whichever languages are available.
        for language in subtitle_languages:
            video_node.add_file(files.YouTubeSubtitleFile(
                youtube_id=youtube_id, language=language))

        return video_node

    img = doc.select_one(".uncode-single-media-wrapper img")
    if img:
        img_src = img["data-guid"] or img["src"]
        destination = tempfile.mkdtemp()
        download_file(img_src, destination, request_fn=make_request, filename="image.jpg")

        with open(os.path.join(destination, "index.html"), "w") as f:
            f.write("""
                <!doctype html>
                <html>
                <head></head>
                <body>
                    <img src="image.jpg" style="width: 100%; max-width: 1200px;" />
                </body>
                </html>
            """)

        zip_path = create_predictable_zip(destination)
        return nodes.HTML5AppNode(
            **base_node_attributes,
            files=[files.HTMLZipFile(zip_path)],
            thumbnail=img_src,
        )

    return None
def cool(self):
    self.zipname = create_predictable_zip(str(TEMP_FOUNDRY_ZIP))
def process_node_from_doc(doc, book_id, title, thumbnail):
    """
    Create a Ricecooker HTML5AppNode instance given the HTML source and metadata.
    """
    if DOWNLOAD_ONE_TO_webroot:
        # Save the book's contents to the folder `webroot` in the chef root dir.
        # Use the script ./ricecooker/utils/kolibripreview.py to preview in Kolibri.
        destination = './webroot'
        if os.path.exists(destination):
            shutil.rmtree(destination)
        os.mkdir(destination)
    else:
        # Create a temporary folder to download all the files for a book
        destination = tempfile.mkdtemp()

    # Ensure the thumbnail is in a format Ricecooker can accept, and if not,
    # use the first slide as the thumbnail.
    thumbnail_extensions = ('jpg', 'jpeg', 'png')
    if not thumbnail.lower().endswith(thumbnail_extensions):
        print("Thumbnail src (%s) doesn't end in any of %s."
              " Will use the first slide as the source." % (thumbnail, thumbnail_extensions))
        first_slide_src = doc.select_one('#slide-container .slide img')['src']
        thumbnail = make_fully_qualified_url(first_slide_src)
        if not thumbnail.lower().endswith(thumbnail_extensions):
            thumbnail = None

    # Download all the JS/CSS/images/audio/etc. needed to make a standalone app
    doc = download_static_assets(doc, destination)

    # Remove a bunch of HTML that we don't want showing in our standalone app
    doc.select_one('base')['href'] = ''
    remove_node(doc, '#loading')
    remove_node(doc, '#finishedActions')
    remove_node(doc, '.bookmarkbtn')
    remove_node(doc, '.reader-expand')
    remove_node(doc, '#progressBar')
    remove_node(doc, '#androidNotification')
    remove_node(doc, '#exit')
    remove_node(doc, '#ttmenu')

    # Remove unnecessary scripts in the head
    for pat in tag_content_patterns_to_remove_in_head:
        remove_nodes_containing_pattern(doc, pat, parent_tag_name='head')
    for pat in tag_content_patterns_to_remove_in_body:
        remove_nodes_containing_pattern(doc, pat, parent_tag_name='body')
    for pat_start, pat_end in cut_start_end_patterns:
        remove_nodes_between_comments(doc, pat_start, pat_end, parent_tag_name='body')

    # Write out the HTML source
    with open(os.path.join(destination, "index.html"), "w") as f:
        f.write(str(doc))

    print("Downloaded book %s titled \"%s\" (thumbnail %s) to destination %s" % (
        book_id, title, thumbnail, destination))
    # preview_in_browser(destination)

    zip_path = create_predictable_zip(destination)
    return nodes.HTML5AppNode(
        source_id=book_id,
        title=truncate_metadata(title),
        license=licenses.CC_BY_NC_SALicense(copyright_holder='3asafeer.com'),
        thumbnail=thumbnail,
        files=[files.HTMLZipFile(zip_path)],
        language="ar",
    )
def make_topic_tree_with_entrypoints(license, imscp_zip, imscp_dict, ims_dir,
                                     temp_dir=None, parent_id=None, node_options=None):
    """Return a TopicTree node from a dict of some subset of an IMSCP manifest.

    The actual IMSCP zip is marked as a dependency, and the zip loaded by
    Kolibri only contains an index.html file that redirects to the entrypoint
    defined in the manifest. This minimizes the additional content generated
    for Kolibri, and also allows us to support content where multiple content
    nodes have entrypoints defined by parameters, e.g. index.html#chapter2,
    index.html#chapter3, etc.

    Ready to be uploaded via Ricecooker to Studio or used in Kolibri.

    Args:
        license - License to apply to content nodes.
        imscp_dict - Dict of IMSCP from extract_from_zip or extract_from_dir.
        ims_dir (string) - Path of directory of IMSCP.
        temp_dir (string, optional) - Full path of temporary directory to
            output HTML zip files to.
        parent_id (string, optional) - Parent ID string to concatenate to source ID.
        node_options (dict, optional) - Options to pass to content renderer in Kolibri.
    """
    if not temp_dir:
        temp_dir = tempfile.tempdir

    source_id = imscp_dict['identifier']
    assert source_id, "{} has no identifier, parent id = {}".format(
        os.path.basename(imscp_zip), parent_id)
    if parent_id:
        source_id = '{}-{}'.format(parent_id, source_id)

    if imscp_dict.get('children'):
        topic_node = nodes.TopicNode(source_id=source_id, title=imscp_dict['title'])
        counter = 1
        for child in imscp_dict['children']:
            # We will get duplicate IDs if we don't have any ID set.
            if not child['identifier']:
                child['identifier'] = 'item{}'.format(counter)
            topic_node.add_child(
                make_topic_tree_with_entrypoints(license, imscp_zip, child, ims_dir,
                                                 temp_dir=temp_dir, parent_id=source_id,
                                                 node_options=node_options))
            counter += 1
        return topic_node
    else:
        if imscp_dict['type'] == 'webcontent':
            entrypoint_dir = os.path.join(temp_dir, 'entrypoint')
            if os.path.exists(entrypoint_dir):
                shutil.rmtree(entrypoint_dir)
            os.makedirs(entrypoint_dir)
            index = os.path.join(entrypoint_dir, "index.html")
            entrypoint_url = '/zipcontent/{}/{}'.format(
                os.path.basename(imscp_zip), imscp_dict['href'])
            f = open(index, "w", encoding="utf-8")
            f.write(ENTRYPOINT_TEMPLATE.format(entrypoint_url))
            f.close()
            zip_path = create_predictable_zip(entrypoint_dir)
            html5_node = nodes.HTML5AppNode(
                source_id=source_id,
                title=imscp_dict.get('title'),
                license=license,
                files=[
                    files.HTMLZipFile(zip_path),
                    files.HTMLZipFile(imscp_zip,
                                      preset=format_presets.HTML5_DEPENDENCY_ZIP),
                ],
            )
            if node_options is not None:
                extra_data = {'options': node_options}
                html5_node.extra_fields.update(extra_data)
            return html5_node
        else:
            logging.warning('Content type %s not supported yet.' % imscp_dict['type'])
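# `ENTRYPOINT_TEMPLATE` is referenced above but not shown; a plausible sketch
# of such a redirect page (an assumption, not the original constant). The only
# job of the tiny generated zip is to forward the browser to the entrypoint
# inside the dependency zip, e.g. /zipcontent/<imscp_zip>/index.html#chapter2:
ENTRYPOINT_TEMPLATE = """<!DOCTYPE html>
<html>
<head><meta charset="utf-8"></head>
<body>
<script>
  // Redirect to the entrypoint served from the dependency zip.
  window.location.href = '{}';
</script>
</body>
</html>"""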
def modify_zip(self, scorm_zip):
    """
    The SCORM modules we receive in some cases have graphics that reference
    UI elements that don't exist in Kolibri. This function modifies the zip
    to remove them and returns the modified zip.

    :param scorm_zip: The path to the original zip file.
    :return: Path to the modified zip file, if it exists.
    """
    zip_dir_name = os.path.splitext(os.path.basename(scorm_zip))[0]
    zip_root = os.path.join(self.temp_dir, zip_dir_name)
    output_zip = os.path.join(self.temp_dir, 'out_zips', zip_dir_name)
    os.makedirs(zip_root, exist_ok=True)
    os.makedirs(os.path.dirname(output_zip), exist_ok=True)

    zip = zipfile.ZipFile(scorm_zip)
    zip.extractall(zip_root)

    zip_changed = False
    telas_end_sprites = os.path.join(zip_root, 'curso', 'telas', 'end', 'sprites.png')
    if os.path.exists(telas_end_sprites):
        LOGGER.debug("Deleting sprites at {}".format(telas_end_sprites))
        os.remove(telas_end_sprites)
        zip_changed = True
    else:
        assert "n1_ted_len_en_u01_v02" not in scorm_zip, os.listdir(zip_root)

    for replace_img in self.replace_images:
        img_glob = glob.glob(os.path.join(zip_root, '**', replace_img), recursive=True)
        for img in img_glob:
            os.remove(img)
            shutil.copy(os.path.join(ROOT_DIR, 'assets', replace_img), img)
            if not replace_img in self.replaced_images:
                self.replaced_images.append(replace_img)
            zip_changed = True

    # make any HTML replacements
    replaced_imgs = []
    for html_file in glob.glob(os.path.join(zip_root, '**', '*.html'), recursive=True):
        soup = BeautifulSoup(open(html_file, 'rb').read(), parser='html.parser')
        for img in self.remove_imgs:
            img_tag = soup.find('img', src=re.compile('{}$'.format(img)))
            if img_tag:
                if not img in self.removed_imgs:
                    self.removed_imgs.append(img)
                replaced_imgs.append(img)
                img_tag.extract()
                f = open(html_file, 'wb')
                f.write(soup.prettify('utf-8'))
                f.close()
                zip_changed = True
                break
            else:
                assert img not in soup.prettify(), \
                    "Problem replacing image {} in {}".format(img, scorm_zip)

    if 'n2_tek_en_lan_u09' in scorm_zip:
        assert zip_changed, "Narrative SCORM module had no changes."
        assert 'kap_cerrar.png' in replaced_imgs, \
            "Replaced images = {}".format(replaced_imgs)
        assert 'kap_cerrar.png' in self.removed_imgs, \
            "Removed images = {}".format(self.removed_imgs)

    if zip_changed:
        temp_zip = create_predictable_zip(zip_root)
        scorm_zip = output_zip + '.zip'
        os.rename(temp_zip, scorm_zip)

    return scorm_zip
def download_sim(self, topic, sim, keywords, language):
    """
    Download, zip, and add a node for a sim, as well as any associated video.
    """
    localized_sim = sim["localizedSimulations"][0]

    print("\tProcessing sim:", localized_sim["title"])

    dst = tempfile.mkdtemp()
    download_file(
        localized_sim["downloadUrl"],
        dst,
        filename="index.html",
        request_fn=sess.get,
        middleware_callbacks=[process_sim_html],
    )
    zippath = create_predictable_zip(dst)

    authors = re.sub(" \(.*?\)", "", sim["credits"]["designTeam"])
    authors = re.sub("<br\/?>", ", ", authors)

    title = localized_sim["title"]
    if language == "ar":
        if title in ARABIC_NAME_CATEGORY:
            title = ARABIC_NAME_CATEGORY[title]
        if title in SIM_TYPO:
            title = SIM_TYPO[title]

    # create a node for the sim
    simnode = HTML5AppNode(
        source_id="sim-%d" % localized_sim["id"],
        files=[HTMLZipFile(zippath)],
        title=title,
        description=sim["description"][language][:200],
        license=CC_BYLicense(
            "PhET Interactive Simulations, University of Colorado Boulder"),
        # author=authors,
        # tags=[keywords[topic] for topic in sim["topicIds"]],
        thumbnail=sim["media"]["thumbnailUrl"],
        language=getlang(language),
    )

    # if there's a video, extract it and put it in the topic right before the sim
    videos = sim["media"]["vimeoFiles"]
    if videos:
        video_url = [v for v in videos if v.get("height") == 540][0]["link"]
        videonode = VideoNode(
            source_id="video-%d" % localized_sim["id"],
            files=[VideoFile(video_url)],
            title="Video: %s" % localized_sim["title"],
            license=CC_BYLicense(
                "PhET Interactive Simulations, University of Colorado Boulder"),
            thumbnail=sim["media"]["thumbnailUrl"],
        )
        topic.add_child(videonode)

    # add the sim node into the topic
    topic.add_child(simnode)