def _is_likely_global_nav(url):
    """
    Returns True if `url` is likely a global nav link, based on how often it is
    seen across pages.
    """
    seen_count = self.global_urls_seen_count[url]
    if debug:
        LOGGER.debug('seen_count/total_urls_seen_count='
                     + str(float(seen_count) / total_urls_seen_count)
                     + '=' + str(seen_count) + '/' + str(total_urls_seen_count)
                     + ' ' + self.url_to_path(url))
    # if previously determined to be a global nav link
    for global_nav_resource in global_nav_nodes['children']:
        if url == global_nav_resource['url']:
            return True
    # if new link that is seen a lot
    if float(seen_count) / total_urls_seen_count > self.GLOBAL_NAV_THRESHOLD:
        return True
    return False
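# A minimal standalone sketch of the frequency-threshold idea used above, with a
# hypothetical threshold value (the real one lives on the class as
# GLOBAL_NAV_THRESHOLD) and illustrative function/parameter names:
from collections import Counter

def find_global_nav_candidates(urls_per_page, threshold=0.2):
    """Return the set of urls whose share of all link sightings exceeds `threshold`."""
    counts = Counter(url for page_urls in urls_per_page for url in page_urls)
    total = sum(counts.values())
    return {url for url, n in counts.items() if n / total > threshold}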
def scrape_content_page(content_page_url, lang):
    """
    Download standalone HTML content pages (non-modules).
    Used for "Curriculum framework" and standalone pages in "Resources".
    Returns: page_info (dict): info necessary for constructing HTML5AppNode and HTMLZipFile
      - title
      - source_id
      - description
      - zip_path
    """
    LOGGER.debug('Scraping content page @ url = ' + str(content_page_url))
    doc = get_parsed_html_from_url(content_page_url)
    destination = tempfile.mkdtemp()
    print('destination=', destination)

    source_id = parse_qs(urlparse(content_page_url).query)['id'][0]
    raw_title = doc.select_one("head title").text
    content_title = raw_title.replace('OLCreate:', '')\
                             .replace('TESSA_ARABIC', '')\
                             .replace('TESSA_Eng', '')\
                             .replace('TESSA_Fr', '')\
                             .strip()
    page_info = dict(
        lang=lang,
        source_id=source_id,
        title=content_title,
        description=None,
        children=[],
    )
    # Do the actual download
    download_page(content_page_url, destination, 'index.html', lang)
    # zip it
    page_info['zip_path'] = create_predictable_zip(destination)
    # ship it
    return page_info
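# Usage sketch (hypothetical page id in the URL): turning the returned page_info
# into the kind of HTML5 node dict used by the tree builders further down.
# TESSA_LICENSE, content_kinds, and file_types are assumed to be module-level names.
page_info = scrape_content_page('http://www.open.edu/openlearncreate/mod/oucontent/view.php?id=12345', 'en')
html5_node = dict(
    kind=content_kinds.HTML5,
    source_id=page_info['source_id'],
    title=page_info['title'],
    license=TESSA_LICENSE,
    files=[dict(file_type=file_types.HTML5, path=page_info['zip_path'], language='en')],
)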
def on_special_subtopic_page(self, url, page, context):
    LOGGER.debug('in on_special_subtopic_page ' + url)
    page_dict = dict(
        kind='special_subtopic_page',  # redundant... -- mismatch with original special_subtopic_page
        url=url,
        children=[],
    )
    page_dict.update(context)
    context['parent']['children'].append(page_dict)

    try:
        menu_row = page.find('div', {'id': 'body-row'})
        menu_row = menu_row.find('div', {'class': 'col-md-2'})
        print(str(menu_row))
    except Exception as e:
        LOGGER.error('on_subtopic_page: %s : %s' % (e, page))
        return

    for link in menu_row.find_all('a', {'class': 'list-group-item'}):
        try:
            title = link.get_text().strip()
            description = ''
            lesson_url = urljoin(url, link['href'])
            if self.should_ignore_url(lesson_url):
                LOGGER.info('ignoring lesson ' + lesson_url)
                continue
            source_id = get_source_id(link['href'])
            LOGGER.debug('  special lesson: %s: %s' % (source_id, title))
            context = dict(
                parent=page_dict,
                kind='fun_page',
                title=title,
                description=description,
                source_id=source_id,
                thumbnail_url=None,
                children=[],
            )
            self.enqueue_url_and_context(lesson_url, context)
            # get_contents(node, link)
        except Exception as e:
            LOGGER.error('on_special_subtopic_page: %s : %s' % (e, link))
def generate_contentinfo_from_folder(self, csvwriter, rel_path, filenames):
    """
    Create a topic node row in Content.csv for the folder at `rel_path` and
    add content node rows for all the files in the `rel_path` folder.
    """
    LOGGER.debug('IN generate_contentinfo_from_folder ' + str(rel_path) + '  ' + str(filenames))
    from ricecooker.utils.linecook import filter_filenames, filter_thumbnail_files, chan_path_from_rel_path

    # WRITE TOPIC ROW
    topicrow = self.channeldir_node_to_row(rel_path.split(os.path.sep))
    csvwriter.writerow(topicrow)

    # WRITE CONTENT NODE ROWS
    chan_path = chan_path_from_rel_path(rel_path, self.channeldir)
    filenames_cleaned = filter_filenames(filenames)
    # filenames_cleaned2 = filter_thumbnail_files(chan_path, filenames_cleaned, self)
    for filename in filenames_cleaned:
        path_tuple = rel_path.split(os.path.sep)
        path_tuple.append(filename)
        filerow = self.channeldir_node_to_row(path_tuple)
        csvwriter.writerow(filerow)
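# Hypothetical driver sketch for the method above: walk the channel directory
# tree and emit one topic row per folder plus content rows for its files.
# (`provider` stands in for the metadata-provider instance; names are illustrative.)
import csv
import os

def write_content_csv(provider, csv_path='Content.csv'):
    with open(csv_path, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        for dirpath, _dirnames, filenames in os.walk(provider.channeldir):
            rel_path = os.path.relpath(dirpath, start=os.path.dirname(provider.channeldir))
            provider.generate_contentinfo_from_folder(csvwriter, rel_path, filenames)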
def on_topic_page(self, url, page, context):
    LOGGER.debug('in on_topic_page ' + url)
    page_dict = dict(
        kind='topic_page',
        url=url,
        children=[],
    )
    page_dict.update(context)
    context['parent']['children'].append(page_dict)

    try:
        body_row = page.find('div', {'id': 'body-row'})
        menu_row = body_row.find('div', {'class': 'col-md-2'})
        subtopics = menu_row.find_all('a')
    except Exception as e:
        LOGGER.error('ERROR get_subtopics: %s : %s' % (e, url))
        return

    for subtopic in subtopics:
        try:
            subtopic_url = urljoin(url, subtopic['href'])
            if self.should_ignore_url(subtopic_url):
                print('ignoring subtopic', subtopic_url)
                continue
            title = get_text(subtopic)
            source_id = get_source_id(subtopic['href'])
            LOGGER.debug('  found subtopic: %s: %s' % (source_id, title))
            context = dict(
                parent=page_dict,
                kind='subtopic_page',
                title=title,
                source_id=source_id,
                children=[],
            )
            self.enqueue_url_and_context(subtopic_url, context)
        except Exception as e:
            LOGGER.error('on_topic_page: %s : %s' % (e, subtopic))
def download_zip_file(url):
    if not url:
        return (False, None)

    if get_suffix(url) != '.zip':
        return (False, None)

    response = sess.get(url)
    if response.status_code != 200:
        LOGGER.error("STATUS: {}, URL: {}".format(response.status_code, url))
        return (False, None)
    elif not response.from_cache:
        LOGGER.debug("NOT CACHED: {}".format(url))

    archive = zipfile.ZipFile(io.BytesIO(response.content))
    archive_members = list(filter(lambda f: f.filename.endswith('.pdf'), archive.infolist()))
    archive_member_names = [None] * len(archive_members)
    for i, pdf in enumerate(archive_members):
        path = os.path.join(PDFS_DATA_DIR, pdf.filename)
        archive_member_names[i] = path
        if not os.path.exists(path):
            archive.extract(pdf, PDFS_DATA_DIR)
    return (True, archive_member_names)
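# Usage sketch (hypothetical URL): fetch a resources zip and collect the
# extracted PDF paths.
success, pdf_paths = download_zip_file('https://example.org/worksheets.zip')
if success:
    for pdf_path in pdf_paths:
        LOGGER.debug('extracted ' + pdf_path)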
def on_lang_page(self, url, page, context):
    LOGGER.debug('in on_lang_page ' + url)
    page_dict = dict(
        kind='lang_page',
        url=url,
        children=[],
    )
    page_dict.update(context)
    context['parent']['children'].append(page_dict)

    try:
        menu_row = page.find('div', {'id': 'menu-row'})
    except Exception as e:
        LOGGER.error('ERROR on_lang_page: %s : %s' % (e, url))
        return

    for topic in menu_row.find_all('a'):
        try:
            if topic['href'] == '#':
                print('skipping', topic)
                continue
            topic_url = urljoin(url, topic['href'].strip())
            print(topic_url)
            if self.should_ignore_url(topic_url):
                print('ignoring topic', topic_url)
                continue

            # metadata
            title = get_text(topic)
            source_id = get_source_id(topic['href'].strip())
            subject_en = source_id  # short string to match on top-level categories
            context = dict(
                parent=page_dict,
                title=title,
                source_id=source_id,
                subject_en=subject_en,
            )
            # print('in on_lang_page topic.title=', title, 'topic_subject_id=', source_id, 'subject_en=', subject_en)

            # what type of tab is it?
            if 'Fun' in topic['href']:
                LOGGER.info('found fun page: %s: %s' % (source_id, title))
                context['kind'] = 'fun_page'
            elif 'Story' in topic['href']:
                LOGGER.info('found story page: %s: %s' % (source_id, title))
                context['kind'] = 'story_page'
            elif any(cid in topic['href'] for cid in SPECIAL_SUBTOPIC_COURSE_IDS):
                LOGGER.info('FOUND three-tab special_subtopic_page page: %s: %s' % (source_id, title))
                context['kind'] = 'special_subtopic_page'
            elif 'gamelist/CRS' in topic['href']:
                LOGGER.info('found top-level CRS page: %s: %s' % (source_id, title))
                context['kind'] = 'fun_page'
            else:
                LOGGER.info('found topic: %s: %s' % (source_id, title))
                context['kind'] = 'topic_page'

            self.enqueue_url_and_context(topic_url, context)
            # if DEBUG_MODE:
            #     return
        except Exception as e:
            LOGGER.error('on_lang_page: %s : %s' % (e, topic))
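# How these on_* handlers are assumed to be wired together: a hypothetical sketch
# of the crawler's dispatch loop (illustrative names, not the actual queue API):
def crawl(self, start_url, root_context):
    self.enqueue_url_and_context(start_url, root_context)
    while self.queue:
        url, context = self.queue.pop(0)
        page = get_parsed_html_from_url(url)
        handler = getattr(self, 'on_' + context['kind'])  # e.g. on_topic_page
        handler(url, page, context)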
def process_folder(channel, rel_path, filenames, metadata_provider):
    """
    Create `ContentNode`s from each file in this folder and attach them to
    `channel` under the path `rel_path`.
    """
    LOGGER.debug('IN process_folder ' + str(rel_path) + '  ' + str(filenames))
    if not keep_folder(rel_path):
        return

    chan_path = chan_path_from_rel_path(rel_path, metadata_provider.channeldir)
    chan_path_tuple = path_to_tuple(chan_path)
    chan_path_list = list(chan_path_tuple)
    LOGGER.debug('chan_path_list=' + str(chan_path_list))

    # FIND THE CONTAINING NODE (channel or topic)
    if len(chan_path_list) == 1:
        # CASE CHANNEL ROOT: `rel_path` points to `channeldir`
        # No need to create a topic node here since channel already exists
        containing_node = channel  # attach content nodes in filenames directly to channel
    else:
        # CASE TOPIC FOLDER: `rel_path` points to a channelroot subfolder (a.k.a. TopicNode)
        dirname = chan_path_list.pop()  # name of the folder (used as ID for internal lookup)
        topic_parent_node = get_topic_for_path(channel, chan_path_list)

        # read topic metadata to get title and description for the TopicNode
        topic_metadata = metadata_provider.get(chan_path_tuple)
        thumbnail_chan_path = topic_metadata.get('thumbnail_chan_path', None)
        if thumbnail_chan_path:
            thumbnail_rel_path = rel_path_from_chan_path(thumbnail_chan_path, metadata_provider.channeldir)
        else:
            thumbnail_rel_path = None

        # create TopicNode for this folder
        topic = dict(
            kind=TOPIC_NODE,
            dirname=dirname,
            source_id='sourceid:' + rel_path,
            title=topic_metadata.get('title', dirname),
            description=topic_metadata.get('description', None),
            author=topic_metadata.get('author', None),
            language=topic_metadata.get('language', None),
            license=topic_metadata.get('license', None),
            thumbnail=thumbnail_rel_path,
            children=[],
        )
        topic_parent_node['children'].append(topic)
        containing_node = topic  # attach content nodes in filenames to the newly created topic

    # filter filenames
    filenames_cleaned = filter_filenames(filenames)
    filenames_cleaned2 = filter_thumbnail_files(chan_path, filenames_cleaned, metadata_provider)

    # PROCESS FILES
    for filename in filenames_cleaned2:
        chan_filepath = os.path.join(chan_path, filename)
        chan_filepath_tuple = path_to_tuple(chan_filepath)
        metadata = metadata_provider.get(chan_filepath_tuple)
        node = make_content_node(metadata_provider.channeldir, rel_path, filename, metadata)
        containing_node['children'].append(node)  # attach content node to containing_node
def transform_html_vertical(vertical, parent_title=None):
    """
    Parses the `html` children of the vertical to generate document nodes from
    linked pdfs, extract downloadable resources, or a standalone html5 app node
    of the html content for all other cases.
    Returns: nodes, downloadable_resources
    """
    if 'children' not in vertical:
        LOGGER.warning('found empty vertical ' + str(vertical))
        return [], []

    assert all(ch['kind'] == 'html' for ch in vertical['children']), 'non-htmls found'
    nodes = []
    downloadable_resources = []
    htmls = [ch for ch in vertical['children'] if ch['kind'] == 'html']
    for html in htmls:
        if 'downloadable_resources' in html and html['downloadable_resources']:
            LOGGER.debug('    found downloadable_resources')
            resources = html['downloadable_resources']
            for resource in resources:
                ext = resource['ext']
                if ext == 'pdf':
                    pdf_node = dict(
                        kind=content_kinds.DOCUMENT,
                        title=resource['title'],
                        description=resource.get('description', ''),
                        source_id=resource['relhref'],
                        license=EDRAAK_LICENSE,
                        language=getlang('ar').code,
                        files=[],
                    )
                    file_dict = dict(
                        file_type=file_types.DOCUMENT,
                        path=resource['relhref'],
                        language=getlang('ar').code,
                    )
                    pdf_node['files'].append(file_dict)
                    nodes.append(pdf_node)
                else:
                    downloadable_resources.append(resource)
        else:
            LOGGER.debug('    packaging html content')
            html5app_dict = dict(
                kind=content_kinds.HTML5,
                title=vertical['display_name'],
                # title=EDRAAK_STRINGS['downloadable_resources'],
                description=html.get('description', ''),
                source_id=html['url_name'],
                license=EDRAAK_LICENSE,
                language=getlang('ar').code,
                files=[],
            )
            zip_path = package_html_content_as_html5_zip_file(html)
            zip_file = dict(
                file_type=file_types.HTML5,
                path=zip_path,
                language=getlang('ar').code,
            )
            html5app_dict['files'].append(zip_file)
            nodes.append(html5app_dict)
    #
    return nodes, downloadable_resources
def transform_tree(clean_tree, coursedir):
    course_id = clean_tree['course']
    course_title = clean_tree['display_name']
    course_thumbnail = os.path.join(coursedir, 'static', clean_tree['course_image'])
    if not os.path.exists(course_thumbnail):
        course_image_with_spaces = clean_tree['course_image'].replace('_', ' ')
        course_thumbnail = os.path.join(coursedir, 'static', course_image_with_spaces)

    course_dict = dict(
        kind=content_kinds.TOPIC,
        title=course_title,
        thumbnail=course_thumbnail,
        source_id=course_id,
        description='',
        language=getlang('ar').code,
        license=EDRAAK_LICENSE,
        children=[],
    )

    for chapter in clean_tree['children']:
        chapter_dict = dict(
            kind=content_kinds.TOPIC,
            title=chapter['display_name'],
            source_id=chapter['url_name'],
            description='',
            language=getlang('ar').code,
            license=EDRAAK_LICENSE,
            children=[],
        )
        course_dict['children'].append(chapter_dict)
        chapter_downloadable_resources = []

        for sequential in chapter['children']:
            # SPECIAL CASE: skip empty parent nodes of discussions
            if len(sequential['children']) == 0:
                LOGGER.debug('Skipping empty sequential ' + str(sequential))
                continue

            # DEFAULT CASE: process as regular topic node
            sequential_dict = dict(
                kind=content_kinds.TOPIC,
                title=sequential['display_name'],
                source_id=sequential['url_name'],
                description=sequential.get('description', ''),
                language=getlang('ar').code,
                license=EDRAAK_LICENSE,
                children=[],
            )
            chapter_dict['children'].append(sequential_dict)

            for vertical in sequential['children']:
                vertical_type = guess_vertical_type(vertical)

                if vertical_type in ['knowledge_check_vertical', 'test_vertical']:
                    exercise_dict = transform_vertical_to_exercise(vertical)
                    if exercise_dict:
                        sequential_dict['children'].append(exercise_dict)

                elif vertical_type == 'video_vertical':
                    video_dict, downloadable_resources = transform_video_vertical(vertical)
                    if video_dict:
                        sequential_dict['children'].append(video_dict)
                    chapter_downloadable_resources.extend(downloadable_resources)

                elif vertical_type == 'html_vertical':
                    nodes, downloadable_resources = transform_html_vertical(vertical)
                    if nodes:
                        sequential_dict['children'].extend(nodes)
                    chapter_downloadable_resources.extend(downloadable_resources)

                else:
                    LOGGER.debug('skipping ' + vertical_type + ' url_name=' + vertical['url_name'])

        #
        if chapter_downloadable_resources:
            LOGGER.debug('  Packaging chapter_downloadable_resources')
            source_id = chapter['url_name'] + '-downloadable-resources'
            html5app_dict = dict(
                kind=content_kinds.HTML5,
                title=EDRAAK_STRINGS['downloadable_resources'],
                description=EDRAAK_STRINGS['downloadable_resources_description'],
                source_id=source_id,
                license=EDRAAK_LICENSE,
                language=getlang('ar').code,
                files=[],
            )
            zip_path = make_html5zip_from_resources(
                chapter_downloadable_resources,
                basefilename=source_id + '2')
            zip_file = dict(
                file_type=file_types.HTML5,
                path=zip_path,
                language=getlang('ar').code,
            )
            html5app_dict['files'].append(zip_file)
            chapter_dict['children'].append(html5app_dict)

    flattened_course_dict = flatten_transformed_tree(course_dict)
    return flattened_course_dict
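# Usage sketch (hypothetical paths): load the cleaned course tree produced by an
# earlier extraction step and transform it into the ricecooker json structure.
import json
with open('chefdata/trees/course_clean.json') as jsonf:
    clean_tree = json.load(jsonf)
course_dict = transform_tree(clean_tree, 'chefdata/courses/sample_course')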
def on_lesson_page(self, url, page, context):
    LOGGER.debug('  in on_lesson_page ' + url)
    page_dict = dict(
        kind='lessons_page',
        url=url,
        children=[],
    )
    page_dict.update(context)
    context['parent']['children'].append(page_dict)

    try:
        menu_row = page.find('div', {'id': 'row-exu'})
    except Exception as e:
        LOGGER.error('on_lesson_page: %s : %s' % (e, page))
        return

    contents = menu_row.find_all('div', {'class': 'col-md-3'})
    for content in contents:
        try:
            title = get_text(content.find('div', {'class': 'txtline'}))
            # TODO: description
            thumbnail = content.find('a').find('img')['src']
            thumbnail = get_absolute_path(thumbnail)
            main_file, master_file, source_id = get_content_link(content)
            LOGGER.debug('      content: %s: %s' % (source_id, title))
            if self.should_ignore_url(main_file):
                print('ignoring content', title, main_file)
                continue
            if len(main_file) < 10:
                print('something strange --- short main_file url: title - main_file - master_file ',
                      title, '-', main_file, '-', master_file)

            if main_file.endswith('mp4') or main_file.endswith('MP4') or main_file.endswith('m4v'):
                video = dict(
                    url=main_file,
                    kind='PrathamVideoResource',
                    description='source_url=' + main_file if DEBUG_MODE else '',
                    title=title,
                    source_id=source_id,
                    thumbnail_url=thumbnail,
                    children=[],
                )
                video.update(self.get_video_metadata(main_file))
                page_dict['children'].append(video)

            elif main_file.endswith('pdf'):
                pdf = dict(
                    url=main_file,
                    kind='PrathamPdfResource',
                    title=title,
                    description='source_url=' + main_file if DEBUG_MODE else '',
                    source_id=source_id,
                    thumbnail_url=thumbnail,
                    children=[],
                )
                page_dict['children'].append(pdf)

            elif main_file.endswith('html') and master_file.endswith('zip'):
                if '.~' in master_file:
                    # Fix broken links of the form https://www.prathamopenschool.org/Gj/gamelist/CRS174/.~/CourseContent/Games/NumberKas_GJ.zip
                    pathels = master_file.split('/')
                    master_file = '/'.join(pathels[0:3] + pathels[7:])
                zipfile = dict(
                    url=master_file,
                    kind='PrathamZipResource',
                    title=title,
                    description='source_url=' + master_file if DEBUG_MODE else '',
                    source_id=source_id,
                    thumbnail_url=thumbnail,
                    main_file=main_file,  # needed to rename to index.html if different
                    children=[],
                )
                page_dict['children'].append(zipfile)

            else:
                LOGGER.error('ZZZZ>>> Content not supported: onpage=%s main_file=%s master_file=%s'
                             % (url, main_file, master_file))
                unsupported_rsrc = dict(
                    url=main_file,
                    referring_url=url,
                    kind='UnsupportedPrathamWebResource',
                    title=title,
                    source_id=source_id,
                    thumbnail_url=thumbnail,
                    children=[],
                )
                page_dict['children'].append(unsupported_rsrc)

        except Exception as e:
            LOGGER.error('zz _process_contents: %s : %s' % (e, content))
def archive_page(url, download_root):
    """
    Download fully rendered page and all related assets into ricecooker's site archive format.

    :param url: URL to download
    :param download_root: Site archive root directory
    :return: A dict containing info about the page archive operation
    """
    os.makedirs(download_root, exist_ok=True)
    content, props = asyncio.get_event_loop().run_until_complete(load_page(url))

    parsed_url = urlparse(url)
    page_domain = parsed_url.netloc.replace(':', '_')

    # get related assets
    base_url = url[:url.rfind('/')]
    urls_to_replace = {}

    if content:
        def html5_derive_filename(url):
            return get_archive_filename(url, page_domain, download_root, urls_to_replace)

        download_static_assets(content, download_root, base_url,
                               derive_filename=html5_derive_filename)

        for key in urls_to_replace:
            url_parts = urlparse(key)
            # When we get an absolute URL, it may appear in one of three different ways in the page:
            key_variants = [
                # 1. /path/to/file.html
                key.replace(url_parts.scheme + '://' + url_parts.netloc, ''),
                # 2. https://www.domain.com/path/to/file.html
                key,
                # 3. //www.domain.com/path/to/file.html
                key.replace(url_parts.scheme + ':', ''),
            ]

            orig_content = content
            for variant in key_variants:
                # Searching within quotes ensures we only replace the exact URL we are
                # trying to replace. We avoid using BeautifulSoup because Python HTML
                # parsers can be destructive and do things like strip out the doctype.
                content = content.replace('="{}"'.format(variant), '="{}"'.format(urls_to_replace[key]))
                content = content.replace('url({})'.format(variant), 'url({})'.format(urls_to_replace[key]))

            if content == orig_content:
                LOGGER.debug("link not replaced: {}".format(key))
                LOGGER.debug("key_variants = {}".format(key_variants))

        download_dir = os.path.join(page_domain, parsed_url.path.split('/')[-1].replace('?', '_'))
        download_path = os.path.join(download_root, download_dir)
        os.makedirs(download_path, exist_ok=True)

        index_path = os.path.join(download_path, 'index.html')
        with open(index_path, 'w', encoding='utf-8') as f:
            f.write(content)

        page_info = {
            'url': url,
            'cookies': props['cookies'],
            'index_path': index_path,
            'resources': list(urls_to_replace.values()),
        }
        return page_info

    return None
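# Usage sketch (hypothetical URL and directory):
page_info = archive_page('https://example.org/lessons/intro.html', './site_archive')
if page_info:
    LOGGER.info('archived to {} with {} assets'.format(
        page_info['index_path'], len(page_info['resources'])))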
def _build_json_tree(parent_node, sourcetree, lang=None):
    # type: (dict, List[dict], str) -> None
    """
    Parse the web resource nodes given in `sourcetree` and add as children
    of `parent_node`.
    """
    # EXPECTED_NODE_TYPES = ['TessaLangWebRessourceTree', 'TessaCategory', 'TessaSubpage',
    #                        'TessaModule']
    for source_node in sourcetree:
        if 'kind' not in source_node:
            print('kind-less source_node', source_node)
            continue
        kind = source_node['kind']
        # if kind not in EXPECTED_NODE_TYPES:
        #     raise NotImplementedError('Unexpected web resource node type encountered.')

        if kind == 'TessaLangWebRessourceTree':
            # this is the root of the tree, no special attributes, just process children
            source_tree_children = source_node.get("children", [])
            _build_json_tree(parent_node, source_tree_children, lang=lang)

        elif kind in ('TessaSubpage', 'TessaAudioResourcesSubpage',
                      'TessaAudioResourceTopicSubpage', 'TessaAudioResourceSection'):
            # all four subpage kinds become TopicNodes; only the log message differs
            child_node = dict(
                kind=content_kinds.TOPIC,
                source_id=source_node['source_id'],
                title=source_node['title'],
                author='TESSA',
                description='',  # TODO: description of source_node['url']
                thumbnail=source_node.get("thumbnail"),
                children=[],
            )
            parent_node['children'].append(child_node)
            LOGGER.debug('Created new TopicNode for %s titled %s' % (kind, child_node['title']))
            source_tree_children = source_node.get("children", [])
            _build_json_tree(child_node, source_tree_children, lang=lang)

        elif kind == 'TessaModule':
            child_node = dict(
                kind=content_kinds.HTML5,
                source_id=source_node['source_id'],
                language=source_node['lang'],
                title=source_node['title'],
                description='',  # TODO: use source_node['description']
                license=TESSA_LICENSE,
                files=[],
            )
            zip_path = download_module(source_node['url'], lang=source_node['lang'])
            module_html_file = dict(
                file_type=file_types.HTML5,
                path=zip_path,
                language=source_node['lang'],
            )
            child_node['files'] = [module_html_file]
            parent_node['children'].append(child_node)
            LOGGER.debug('Created HTML5AppNode for TessaModule titled ' + child_node['title'])

        elif kind == 'TessaContentPage':
            page_info = scrape_content_page(source_node['url'], lang)
            child_node = dict(
                kind=content_kinds.HTML5,
                source_id=source_node['source_id'],
                language=source_node['lang'],
                title=source_node['title'],
                description=source_node.get('description', ''),
                license=TESSA_LICENSE,
                files=[],
            )
            module_html_file = dict(
                file_type=file_types.HTML5,
                path=page_info['zip_path'],
                language=source_node['lang'],
            )
            child_node['files'] = [module_html_file]
            parent_node['children'].append(child_node)
            LOGGER.debug('Created HTML5AppNode for TessaContentPage titled ' + child_node['title'])

        elif kind == 'TessaAudioResouce':  # (sic) kind string as produced by the crawler
            child_node = dict(
                kind=content_kinds.AUDIO,
                source_id=source_node['source_id'],
                language=source_node['lang'],
                title=source_node.get('title', 'NOTITLE'),
                description='',  # TODO: use source_node['description']
                license=TESSA_LICENSE,
                files=[],
            )
            mp3_file = dict(
                file_type=file_types.AUDIO,
                path=source_node['url'],
                language=source_node['lang'],
            )
            child_node['files'] = [mp3_file]
            parent_node['children'].append(child_node)
            LOGGER.debug('Created AudioNode from file url ' + source_node['url'])

        elif kind == 'TessaPDFDocument':
            child_node = dict(
                kind=content_kinds.DOCUMENT,
                source_id=source_node['source_id'],
                language=source_node['lang'],
                title=source_node.get('title', 'NOTITLE'),
                description='',  # TODO: use source_node['description']
                license=TESSA_LICENSE,
                files=[],
            )
            pdf_file = dict(
                file_type=file_types.DOCUMENT,
                path=source_node['url'],
                language=source_node['lang'],
            )
            child_node['files'] = [pdf_file]
            parent_node['children'].append(child_node)
            LOGGER.debug('Created PDF Document Node from url ' + source_node['url'])

        else:
            # LOGGER.critical("Encountered an unknown content node format.")
            print('***** Skipping content kind', source_node['kind'],
                  'titled', source_node.get('title', 'NOTITLE'))
            continue

    return parent_node
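# Usage sketch: build the ricecooker json tree under a minimal channel root dict,
# given a previously crawled web resource tree (`web_resource_tree` is a
# hypothetical variable standing in for the crawler's output):
channel_root = dict(title='TESSA (en)', children=[])
_build_json_tree(channel_root, web_resource_tree['children'], lang='en')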
def download_module_no_toc(module_url, lang=None):
    """
    Extracting the module table of contents from the sidebar nav doesn't work
    for certain modules in FR, e.g.
    http://www.open.edu/openlearncreate/mod/oucontent/view.php?id=105334&section=1.1
    If NO TOC is available, then we'll crawl pages one by one (`module_contents_dict`).
    """
    LOGGER.debug('Scraping module @ url = ' + str(module_url))
    doc = get_parsed_html_from_url(module_url)
    destination = tempfile.mkdtemp()
    print('destination=', destination)

    # copy css/js/images from skel
    shutil.copytree('chefdata/templates/module_skel/styles',
                    os.path.join(destination, 'styles'))

    source_id = parse_qs(urlparse(module_url).query)['id'][0]
    raw_title = doc.select_one("head title").text
    module_title = raw_title.replace('OLCreate:', '')\
                            .replace('TESSA_ARABIC', '')\
                            .replace('TESSA_Eng', '')\
                            .replace('TESSA_Fr', '')\
                            .strip()
    module_contents_dict = dict(
        kind='TessaModuleContentsDict',
        source_id=source_id,
        title=module_title,
        lang=lang,
        children=[],
    )
    # print(module_contents_dict)

    # recursively download all sections by following "Next" links
    current_url = module_url
    current_section = None
    is_first_section = True
    while True:
        LOGGER.debug('processing current_url ' + str(current_url))
        current_doc = get_parsed_html_from_url(current_url)

        # special handling for module-level page (no section in url but is really Section 1)
        if is_first_section:
            section_filename = 'section-1.html'
            is_first_section = False
        else:
            section_filename = get_section_filename(current_url)

        # Do the actual download
        download_section(current_url, destination, section_filename, lang)

        # Store section/subsection info so we can build the TOC later
        doc = get_parsed_html_from_url(current_url)
        raw_title = doc.select_one("head title").text
        the_title = raw_title.replace('OLCreate:', '')\
                             .replace('TESSA_ARABIC', '')\
                             .replace('TESSA_Eng', '')\
                             .replace('TESSA_Fr', '')\
                             .strip()

        # sections, e.g. section-3.html
        if '_' not in section_filename:
            section_dict = dict(
                kind='TessaModuleContentsSection',
                title=the_title,
                href=current_url,
                filename=section_filename,
                children=[],
            )
            module_contents_dict['children'].append(section_dict)
            print('   - section:', the_title[0:80])
            current_section = section_dict
        # subsections, e.g. section-3_2.html
        else:
            subsection_title = the_title.replace(module_title, '')
            subsection_title = subsection_title.replace(current_section['title'], '')
            subsection_title = subsection_title.lstrip()
            if subsection_title.startswith(': '):
                subsection_title = subsection_title.replace(': ', '', 1)
            subsection_dict = dict(
                kind='TessaModuleContentsSubsection',
                title=subsection_title,
                href=current_url,
                filename=section_filename,
            )
            print('     - subsection:', subsection_title[0:80])
            current_section['children'].append(subsection_dict)

        # Recurse if next
        next_url = _get_next_section_url(current_doc)
        if next_url:
            current_url = next_url
        else:
            break

    # for debugging...
    # pp.pprint(module_contents_dict)

    module_index_tmpl = jinja2.Template(open('chefdata/templates/module_index.html').read())
    index_contents = module_index_tmpl.render(module=module_contents_dict)
    with open(os.path.join(destination, "index.html"), "w") as f:
        f.write(index_contents)

    # return module_contents_dict
    zip_path = create_predictable_zip(destination)
    return zip_path
def download_module(module_url, lang=None):
    LOGGER.debug('Scraping module @ url = ' + module_url)
    doc = get_parsed_html_from_url(module_url)

    source_id = parse_qs(urlparse(module_url).query)['id'][0]
    raw_title = doc.select_one("head title").text
    module_title = raw_title.replace('OLCreate:', '')\
                            .replace('TESSA_ARABIC', '')\
                            .replace('TESSA_Eng', '')\
                            .replace('TESSA_Fr', '')\
                            .strip()
    module_contents_dict = dict(
        kind='TessaModuleContentsDict',
        lang=lang,
        source_id=source_id,
        title=module_title,
        children=[],
    )

    # TRY TO CREATE MODULE TOC SIDEBAR MENU
    ############################################################################
    current_li_deep = doc.find('li', class_='oucontent-tree-current')
    # Sept 5th: special treatment for modules with no TOC in sidebar
    if current_li_deep is None:
        return download_module_no_toc(module_url, lang=lang)

    # CREATE MODULE TOC SIDEBAR MENU
    # July 28 HACK: infer module_toc_li using marker on sublist-li
    ############################################################################
    destination = tempfile.mkdtemp()
    print('destination=', destination)

    # copy css/js/images from skel
    shutil.copytree('chefdata/templates/module_skel/styles',
                    os.path.join(destination, 'styles'))

    is_first_section = True
    module_toc_li = current_li_deep.find_parent('li', class_='item-section')
    # print(module_toc_li.prettify())
    # module_contents_div = module_toc_li.find('div', class_='oucontent-contents')
    outer_module_ul = module_toc_li.find('ul', class_='child-item-list', recursive=False)
    inner_module_ul = outer_module_ul.find('div', class_='oucontent-contents').find('ul', recursive=False)
    section_lis = inner_module_ul.find_all('li', recursive=False)
    print(len(section_lis))

    # DETECT IF SIMPLE MODULE (single page, no sections) OR COMPLEX MODULE (with sections)
    if len(section_lis) == 0:
        print('UNEXPECTED -------- len(section_lis) == 0')
        print(module_url, '<<< <<< ' * 6)
    if len(section_lis) == 1:
        is_simple_module = True
    else:
        is_simple_module = False

    # SIMPLE MODULES THAT CONSIST OF A SINGLE PAGE -- becomes index.html
    if is_simple_module:
        section_li = section_lis[0]
        # print('*'*120)
        # print(section_li.prettify())
        section_title_span = section_li.find('span', class_='oucontent-tree-item')
        section_title = get_text(section_title_span)
        print('Processing simple module:', section_title)
        section_dict = dict(
            kind='TessaModuleContentsSection',
            title=section_title,
            href=module_url,
            filename='index.html',  # TODO: figure out if this is necessary
            children=[],
        )
        # print('   section:', section_title)
        module_contents_dict['children'].append(section_dict)

        subsections_ul = section_li.find('ul', recursive=False)
        if subsections_ul:
            pass  # print('found some subsections...')
        else:
            pass  # print('no subsections <ul> found in this section')

        download_page(module_url, destination, 'index.html', lang)
    # /SIMPLE MODULE

    # COMPLEX MODULES WITH SECTIONS AND custom-made TOC in index.html
    else:
        for section_li in section_lis:
            if 'download individual sections' in get_text(section_li):  # TODO: AR, SW, FR
                print('skipping section "Read or download individual sections..."')
                continue
            # print(section_li.prettify())
            # print('>'*80)
            section_title_span = section_li.find('span', class_='oucontent-tree-item')
            if section_title_span:
                if section_title_span.find('span', class_='current-title'):
                    section_href = module_url
                else:
                    section_a = section_title_span.find('a')
                    if section_a:
                        section_href = section_a['href']
                    else:
                        section_href = '#NOLINK'  # for sections like "Top 20 ideas for teaching large classes"
            else:
                section_href = '#NOLINK'  # for sections like "Read or download individual sections of the m..."

            # special case for first section --- since it doesn't save section in filename,
            # manually call download_page with filename section-1.html with contents of current page
            if is_first_section:
                section_filename = 'section-1.html'
                is_first_section = False
            else:
                if '#NOLINK' not in section_href:
                    section_filename = get_section_filename(section_href)

            # accesshide_span = section_title_span.find('span', class_='accesshide')
            # if accesshide_span:
            #     accesshide_span.extract()
            # subsections_ul.extract()
            section_title = get_text(section_title_span)
            section_dict = dict(
                kind='TessaModuleContentsSection',
                title=section_title,
                href=section_href,
                filename=section_filename,
                children=[],
            )
            # print('   section:', section_title)
            module_contents_dict['children'].append(section_dict)

            subsections_ul = section_li.find('ul', recursive=False)
            if subsections_ul:
                subsection_lis = subsections_ul.find_all('li')
                for subsection_li in subsection_lis:
                    # print('<'*100)
                    # print(subsection_li.prettify())
                    subsection_link = subsection_li.find('a')
                    if not subsection_link:  # handle weird no-link subsections
                        LOGGER.warning('((((( Skipping section ' + subsection_li.get_text()
                                       + ' because no subsection_link')
                        continue
                    subsection_href = subsection_link['href']
                    subsection_filename = get_section_filename(subsection_href)
                    # subaccesshide_span = subsection_li.find('span', class_='accesshide')
                    # if subaccesshide_span:
                    #     subaccesshide_span.extract()
                    subsection_title = get_text(subsection_li)
                    subsection_dict = dict(
                        kind='TessaModuleContentsSubsection',
                        title=subsection_title,
                        href=subsection_href,
                        filename=subsection_filename,
                    )
                    # print('     subsection:', subsection_title)
                    section_dict['children'].append(subsection_dict)
            else:
                print('no subsections <ul> found in this section')

        module_index_tmpl = jinja2.Template(open('chefdata/templates/module_index.html').read())
        index_contents = module_index_tmpl.render(module=module_contents_dict)
        with open(os.path.join(destination, "index.html"), "w") as f:
            f.write(index_contents)

        # download the html content from each section/subsection
        for section in module_contents_dict['children']:
            if '#NOLINK' in section['href']:
                print('nothing to download for #NOLINK section')
                continue
            download_section(section['href'], destination, section['filename'], lang)
            for subsection in section['children']:
                if '#NOLINK' in subsection['href']:
                    print('nothing to download for #NOLINK subsection')
                    continue
                download_section(subsection['href'], destination, subsection['filename'], lang)
    # /COMPLEX MODULE

    zip_path = create_predictable_zip(destination)
    return zip_path
def modify_zip(self, scorm_zip):
    """
    The SCORM modules we receive in some cases have graphics that reference UI
    elements that don't exist in Kolibri. This function modifies the zip to
    remove them and returns the modified zip.

    :param scorm_zip: The path to the original zip file.
    :return: Path to the modified zip file, if it exists.
    """
    zip_dir_name = os.path.splitext(os.path.basename(scorm_zip))[0]
    zip_root = os.path.join(self.temp_dir, zip_dir_name)
    output_zip = os.path.join(self.temp_dir, 'out_zips', zip_dir_name)
    os.makedirs(zip_root, exist_ok=True)
    os.makedirs(os.path.dirname(output_zip), exist_ok=True)

    zf = zipfile.ZipFile(scorm_zip)
    zf.extractall(zip_root)

    zip_changed = False
    telas_end_sprites = os.path.join(zip_root, 'curso', 'telas', 'end', 'sprites.png')
    if os.path.exists(telas_end_sprites):
        LOGGER.debug("Deleting sprites at {}".format(telas_end_sprites))
        os.remove(telas_end_sprites)
        zip_changed = True
    else:
        assert "n1_ted_len_en_u01_v02" not in scorm_zip, os.listdir(zip_root)

    for replace_img in self.replace_images:
        img_glob = glob.glob(os.path.join(zip_root, '**', replace_img), recursive=True)
        for img in img_glob:
            os.remove(img)
            shutil.copy(os.path.join(ROOT_DIR, 'assets', replace_img), img)
            if replace_img not in self.replaced_images:
                self.replaced_images.append(replace_img)
            zip_changed = True

    # make any HTML replacements
    replaced_imgs = []
    for html_file in glob.glob(os.path.join(zip_root, '**', '*.html'), recursive=True):
        soup = BeautifulSoup(open(html_file, 'rb').read(), 'html.parser')
        for img in self.remove_imgs:
            img_tag = soup.find('img', src=re.compile('{}$'.format(img)))
            if img_tag:
                if img not in self.removed_imgs:
                    self.removed_imgs.append(img)
                replaced_imgs.append(img)
                img_tag.extract()
                with open(html_file, 'wb') as f:
                    f.write(soup.prettify('utf-8'))
                zip_changed = True
                break
            else:
                assert img not in soup.prettify(), "Problem replacing image {} in {}".format(img, scorm_zip)

    if 'n2_tek_en_lan_u09' in scorm_zip:
        assert zip_changed, "Narrative SCORM module had no changes."
        assert 'kap_cerrar.png' in replaced_imgs, "Replaced images = {}".format(replaced_imgs)
        assert 'kap_cerrar.png' in self.removed_imgs, "Removed images = {}".format(self.removed_imgs)

    if zip_changed:
        temp_zip = create_predictable_zip(zip_root)
        scorm_zip = output_zip + '.zip'
        os.rename(temp_zip, scorm_zip)

    return scorm_zip
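# Usage sketch (hypothetical wiring): patch a downloaded SCORM zip before it is
# packaged into the channel tree; `chef` stands in for the class instance.
fixed_zip = chef.modify_zip(os.path.join(chef.temp_dir, 'n2_tek_en_lan_u09.zip'))
LOGGER.info('SCORM zip ready at ' + fixed_zip)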
def convert_ka_node_to_ricecooker_node(self, ka_node, target_lang=None):
    """
    Convert a KA node (a subclass of `KhanNode`) to a ricecooker node (dict).
    Returns None if the node's slug is blacklisted or the node is inadmissible
    for another reason (e.g. an untranslated video with no subs available).
    """
    if ka_node.slug in self.slug_blacklist:
        return None

    if isinstance(ka_node, KhanTopic):
        LOGGER.debug('Converting ka_node ' + ka_node.slug + ' to ricecooker json')
        topic = dict(
            kind=content_kinds.TOPIC,
            source_id=ka_node.id,
            title=ka_node.title,
            description=ka_node.description[:400] if ka_node.description else '',
            slug=ka_node.slug,
            children=[],
        )
        for ka_node_child in ka_node.children:
            if isinstance(ka_node_child, KhanTopic) \
                    and ka_node_child.slug in self.topic_replacements:
                # This topic must be replaced by a list of other topic nodes
                replacements = self.topic_replacements[ka_node_child.slug]
                LOGGER.debug('Replacing ka_node ' + ka_node.slug
                             + ' with replacements=' + str(replacements))
                for r in replacements:
                    rtopic = dict(
                        kind=content_kinds.TOPIC,
                        source_id=r['slug'],
                        title=r['translatedTitle'],  # guaranteed to exist
                        description=r.get('description'),  # (optional)
                        slug=r['slug'],
                        children=[],
                    )
                    topic["children"].append(rtopic)
                    LOGGER.debug('  >>> rtopic = ' + rtopic["slug"])
                    for rchild in r['children']:  # guaranteed to exist
                        LOGGER.debug('   >>>> rchild["slug"] = ' + rchild["slug"])
                        if 'children' not in rchild:
                            # CASE A: two-level replacement hierarchy
                            rchild_ka_node = self.topics_by_slug.get(rchild['slug'])
                            if rchild_ka_node:
                                if 'translatedTitle' in rchild:
                                    rchild_ka_node.title = rchild['translatedTitle']
                                rchildtopic = self.convert_ka_node_to_ricecooker_node(
                                    rchild_ka_node, target_lang=target_lang)
                                if rchildtopic:
                                    rtopic["children"].append(rchildtopic)
                            else:
                                LOGGER.warning('Failed to find rchild slug=' + rchild['slug'])
                        else:
                            # CASE B: three-level replacement hierarchy
                            rchildtopic = dict(
                                kind=content_kinds.TOPIC,
                                source_id=rchild['slug'],
                                title=rchild['translatedTitle'],  # guaranteed to exist
                                description=rchild.get('description'),  # (optional)
                                slug=rchild['slug'],
                                children=[],
                            )
                            rtopic["children"].append(rchildtopic)
                            for rgrandchild in rchild['children']:
                                rgrandchild_slug = rgrandchild['slug']
                                LOGGER.debug('     >>> rgrandchild_slug = ' + rgrandchild_slug)
                                rgrandchild_ka_node = self.topics_by_slug.get(rgrandchild_slug)
                                if rgrandchild_ka_node:
                                    if 'translatedTitle' in rgrandchild:
                                        rgrandchild_ka_node.title = rgrandchild['translatedTitle']
                                    rgrandchildtopic = self.convert_ka_node_to_ricecooker_node(
                                        rgrandchild_ka_node, target_lang=target_lang)
                                    if rgrandchildtopic:
                                        rchildtopic["children"].append(rgrandchildtopic)
                                else:
                                    LOGGER.warning('Failed to find rgrandchild slug=' + rgrandchild_slug)
            else:
                # This is the more common case (no replacement), just add...
                child = self.convert_ka_node_to_ricecooker_node(
                    ka_node_child, target_lang=target_lang)
                if child:
                    topic["children"].append(child)

        # Skip empty topics
        if topic["children"]:
            return topic
        else:
            return None

    elif isinstance(ka_node, KhanExercise):
        if ka_node.mastery_model in EXERCISE_MAPPING:
            mastery_model = EXERCISE_MAPPING[ka_node.mastery_model]
        else:
            LOGGER.warning("Unknown mastery model ({}) for exercise with id: {}".format(
                ka_node.mastery_model, ka_node.id))
            mastery_model = exercises.M_OF_N

        # common core tags
        tags = []
        if ka_node.slug in CC_MAPPING:
            tags.append(CC_MAPPING[ka_node.slug])

        exercise = dict(
            kind=content_kinds.EXERCISE,
            source_id=ka_node.id,
            title=ka_node.title,
            description=ka_node.description[:400] if ka_node.description else '',
            exercise_data=mastery_model,
            license=dict(
                license_id=licenses.SPECIAL_PERMISSIONS,
                copyright_holder="Khan Academy",
                description="Permission granted to distribute through Kolibri for non-commercial use",
            ),  # need to formalize with KA
            thumbnail=ka_node.thumbnail,
            slug=ka_node.slug,
            questions=[],
            tags=tags,
        )
        for ka_assessment_item in ka_node.get_assessment_items():
            if ka_assessment_item.data and ka_assessment_item.data != "null":
                assessment_item = dict(
                    question_type=exercises.PERSEUS_QUESTION,
                    id=ka_assessment_item.id,
                    item_data=ka_assessment_item.data,
                    source_url=ka_assessment_item.source_url,
                )
                exercise["questions"].append(assessment_item)
        # if there are no questions for this exercise, return None
        if not exercise["questions"]:
            return None
        return exercise

    elif isinstance(ka_node, KhanVideo):
        le_target_lang = target_lang
        DUBBED_VIDEOS = DUBBED_VIDEOS_BY_LANG.get(le_target_lang, [])
        target_lang = VIDEO_LANGUAGE_MAPPING.get(target_lang, target_lang)
        if ka_node.youtube_id != ka_node.translated_youtube_id:
            if ka_node.lang != target_lang.lower():
                LOGGER.info("Node with youtube id: {} and translated id: {} has wrong language".format(
                    ka_node.youtube_id, ka_node.translated_youtube_id))
                return None

        files = [
            dict(
                file_type="video",
                youtube_id=ka_node.translated_youtube_id,
                high_resolution=False,
                download_settings={
                    'postprocessors': [{
                        'key': 'ExecAfterDownload',
                        'exec_cmd': 'ffmpeg -hide_banner -loglevel panic -i {} -b:a 32k -ac 1 {}_tmp.mp4 && mv {}_tmp.mp4 {}',
                    }]
                },
            )
        ]

        # Find all subtitles that are available for this video
        subtitle_languages = get_subtitle_languages(ka_node.translated_youtube_id)

        # if the video is untranslated and no subs are available in target lang, return None
        if ka_node.lang != target_lang.lower():
            if ka_node.translated_youtube_id in DUBBED_VIDEOS:
                pass  # videos known to be translated that should be included
            elif not any(should_include_subtitle(sub_code, le_target_lang)
                         for sub_code in subtitle_languages):
                LOGGER.error("Untranslated video {} and no subs available. Skipping.".format(
                    ka_node.translated_youtube_id))
                return None

        for lang_code in subtitle_languages:
            if is_youtube_subtitle_file_supported_language(lang_code):
                if target_lang == "en":
                    # KA English is special: use subs for all available langs
                    files.append(dict(
                        file_type="subtitles",
                        youtube_id=ka_node.translated_youtube_id,
                        language=lang_code,
                    ))
                elif should_include_subtitle(lang_code, le_target_lang):
                    files.append(dict(
                        file_type="subtitles",
                        youtube_id=ka_node.translated_youtube_id,
                        language=lang_code,
                    ))
                else:
                    LOGGER.debug('Skipping subs with lang_code {} for video {}'.format(
                        lang_code, ka_node.translated_youtube_id))

        # convert KA's license format into our internal license classes
        if ka_node.license in LICENSE_MAPPING:
            license = LICENSE_MAPPING[ka_node.license]
        else:
            # license = licenses.CC_BY_NC_SA  # or?
            LOGGER.error("Unknown license ({}) on video with youtube id: {}".format(
                ka_node.license, ka_node.translated_youtube_id))
            return None

        video = dict(
            kind=content_kinds.VIDEO,
            # POLICY: set the `source_id` based on the `youtube_id` of the
            # original English video and not the `translated_youtube_id`:
            source_id=ka_node.youtube_id,
            title=ka_node.title,
            description=ka_node.description[:400] if ka_node.description else '',
            license=license,
            thumbnail=ka_node.thumbnail,
            files=files,
        )
        return video

    elif isinstance(ka_node, KhanArticle):
        # TODO
        return None
def on_fun_page(self, url, page, context):
    """
    This handles pages of the form gamelist/CRS??? and hn/Fun that contain
    direct links to resources without the topic and subtopic hierarchy.
    """
    LOGGER.debug('  in on_fun_page ' + url)
    page_dict = dict(
        kind='fun_page',
        url=url,
        children=[],
    )
    page_dict.update(context)
    context['parent']['children'].append(page_dict)

    try:
        body_row = page.find('div', {'id': 'body-row'})
        contents_row = body_row.find('div', {'class': 'row'})
    except Exception as e:
        LOGGER.error('ERROR on_fun_page: %s : %s' % (e, url))
        return

    contents = contents_row.find_all('div', {'class': 'col-md-3'})
    for content in contents:
        try:
            title = get_text(content.find('div', {'class': 'txtline'}))
            # TODO: description
            thumbnail = content.find('a').find('img')['src']
            thumbnail = get_absolute_path(thumbnail)

            # get_fun_content_link
            link = content.find('a')
            source_id = link['href'][1:]
            fun_resource_url = get_absolute_path(link['href'])

            # direct_download_url = None
            direct_download_link = content.find('a', class_='dnlinkfunstory')
            if direct_download_link:
                direct_download_href = direct_download_link['href'].strip()
                # direct_download_url = get_absolute_path(direct_download_href)

            # Need to GET the FunResource detail page since main_file is not available in the listing
            fun_rsrc_html = requests.get(fun_resource_url).text
            respath_url = get_respath_url_from_html(fun_rsrc_html)
            fun_doc = BeautifulSoup(fun_rsrc_html, "html.parser")
            download_url = get_download_url_from_doc(url, fun_doc)
            respath_path = urlparse(respath_url).path
            if self.should_ignore_url(respath_url):
                print('ignoring fun content', title, respath_url)
                continue
            LOGGER.debug('      Fun content: %s: %s at %s' % (source_id, title, respath_url))

            if respath_path.endswith('mp4') or respath_path.endswith('MP4') or respath_path.endswith('m4v'):
                video = dict(
                    url=respath_url,
                    kind='PrathamVideoResource',
                    title=title,
                    description='source_url=' + respath_url if DEBUG_MODE else '',
                    source_id=source_id,
                    thumbnail_url=thumbnail,
                    children=[],
                )
                video.update(self.get_video_metadata(respath_url))
                page_dict['children'].append(video)

            elif respath_path.endswith('pdf'):
                pdf = dict(
                    url=respath_url,
                    kind='PrathamPdfResource',
                    description='source_url=' + respath_url if DEBUG_MODE else '',
                    title=title,
                    source_id=source_id,
                    thumbnail_url=thumbnail,
                    children=[],
                )
                page_dict['children'].append(pdf)

            elif download_url and download_url.endswith('zip'):
                if '.~' in download_url:
                    # Fix broken links of the form https://www.prathamopenschool.org/Gj/gamelist/CRS174/.~/CourseContent/Games/NumberKas_GJ.zip
                    pathels = download_url.split('/')
                    download_url = '/'.join(pathels[0:3] + pathels[7:])
                zipfile = dict(
                    url=download_url,
                    kind='PrathamZipResource',
                    title=title,
                    description='source_url=' + download_url if DEBUG_MODE else '',
                    source_id=source_id,
                    thumbnail_url=thumbnail,
                    main_file=respath_url,  # needed to rename to index.html if different
                    children=[],
                )
                page_dict['children'].append(zipfile)

            elif respath_path.endswith('html'):
                download_url = respath_url.replace('/index.html', '.zip')
                html_rsrc = dict(
                    url=download_url,
                    kind='PrathamZipResource',  # used to be OtherPrathamHtmlResource
                    title=title,
                    description='source_url=' + download_url if DEBUG_MODE else '',
                    source_id=source_id,
                    thumbnail_url=thumbnail,
                    main_file=respath_url,
                    children=[],
                )
                page_dict['children'].append(html_rsrc)

            else:
                LOGGER.error('ZZZZ>>> Fun resource not supported: onpage=%s respath_path=%s download_url=%s'
                             % (url, respath_path, download_url))
                unsupported_rsrc = dict(
                    url=respath_url,
                    referring_url=url,
                    kind='UnsupportedPrathamWebResource',
                    title=title,
                    source_id=source_id,
                    thumbnail_url=thumbnail,
                    children=[],
                )
                page_dict['children'].append(unsupported_rsrc)

        except Exception as e:
            LOGGER.error('on_fun_page: %s : %s' % (e, content))
def content_node_from_entry(entry, lang_code):
    """
    Convert a feed entry into a ricecooker json dict.
    """
    # METADATA
    ############################################################################

    # author (using a ,-separated list in case of multiple authors/contributors)
    authors_str = _author_from_entry(entry)

    # license info
    # currently one of {'African Storybook Initiative', 'USAID'}
    dcterms_publisher = entry['dcterms_publisher']
    license_id = guess_license_id_from_string(entry['dcterms_license'])
    LICENSE = get_license(license_id, copyright_holder=dcterms_publisher).as_dict()

    provider = dcterms_publisher
    # since we're importing the content from here
    aggregator = 'Global Digital Library'

    # CONTENT
    ############################################################################
    pdf_link = None
    epub_link = None
    thumbnail_url = None
    for link in entry.links:
        if link['type'] == 'application/pdf':
            pdf_link = link
        elif link['type'] == 'application/epub+zip':
            epub_link = link
        elif link['rel'] == _REL_OPDS_IMAGE:
            thumbnail_url = link['href']
        elif link['rel'] == _REL_OPDS_THUMBNAIL:
            pass  # skip thumbnail URLs silently --- prefer _REL_OPDS_IMAGE because it has the right extension
        else:
            print('Skipping link', link)

    # prefer EPUBs...
    if epub_link:
        epub_url = epub_link['href']
        child_node = dict(
            kind=content_kinds.DOCUMENT,
            source_id=entry['id'],
            language=lang_code,
            title=entry['title'],
            description=entry.get('summary', None),
            author=authors_str,
            license=LICENSE,
            provider=provider,
            aggregator=aggregator,
            thumbnail=thumbnail_url,
            files=[],
        )
        epub_file = dict(
            file_type=file_types.EPUB,
            path=epub_url,
            language=lang_code,
        )
        child_node['files'] = [epub_file]
        LOGGER.debug('Created EPUB Document Node from url ' + epub_url)
        return child_node

    # ... but if no EPUB, then get the PDF.
    elif epub_link is None and pdf_link:
        pdf_url = pdf_link['href']
        child_node = dict(
            kind=content_kinds.DOCUMENT,
            source_id=entry['id'],
            language=lang_code,
            title=entry['title'],
            description=entry.get('summary', None),
            author=authors_str,
            license=LICENSE,
            provider=provider,
            aggregator=aggregator,
            thumbnail=thumbnail_url,
            files=[],
        )
        if dcterms_publisher in BOOK_PUBLISHERS_TO_CROP:
            pdf_path = crop_pdf_from_url(pdf_url)  # crop African Storybook PDFs
        else:
            pdf_path = pdf_url  # upload unmodified PDF
        pdf_file = dict(
            file_type=file_types.DOCUMENT,
            path=pdf_path,
            language=lang_code,
        )
        child_node['files'] = [pdf_file]
        LOGGER.debug('Created PDF Document Node from url ' + pdf_url)
        return child_node

    else:
        print('***** Skipping content, because no supported formats found', entry)
        return None
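# Usage sketch, assuming the OPDS feed is fetched with `feedparser` (the
# entry.links / entry['dcterms_publisher'] access pattern above suggests it);
# the feed URL here is purely illustrative:
import feedparser
feed = feedparser.parse('https://example.org/opds/v1/en/root.xml')
nodes = [node for node in (content_node_from_entry(e, 'en') for e in feed.entries) if node]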