def create_document_node(path, title, target_node, source_id, **details):
    """Attach a document node for the file at `path` under `target_node`.

    The node's source id is `source_id`, a hyphen, and a slug of `title`
    (spaces replaced by hyphens, then lower-cased). Extra keyword arguments
    in `details` are forwarded unchanged to `nodes.DocumentNode`.
    """
    attachment = files.DocumentFile(path)
    slug = title.replace(" ", "-").lower()
    node = nodes.DocumentNode(
        source_id="{}-{}".format(source_id, slug),
        title=title,
        files=[attachment],
        **details
    )
    target_node.add_child(node)
def construct_channel(self, *args, **kwargs):
    """Build the channel tree: one topic per book, one document per chapter.

    Args:
      - args: positional arguments, forwarded to `self.get_channel`
      - kwargs: keyword arguments, forwarded to `self.get_channel`
    Returns: the channel obtained from `self.get_channel`, with one topic
        node per entry of the JSON file and one PDF document node per
        chapter returned by the PDF parser.
    """

    def fresh_tags():
        # Return a new list on every call so no two nodes share a list object.
        return [
            "Teacher facing", "Professional development", "Life skills",
            "Intercultural skills", "Mentorship", "Formal contexts"
        ]

    channel = self.get_channel(*args, **kwargs)

    for book in load_json_from_file(JSON_FILE):
        book_title = book['book_title']
        source_id = book_title.replace(" ", "_")
        url = book['path_or_url']

        book_topic = nodes.TopicNode(source_id=source_id,
                                     title=book_title,
                                     tags=fresh_tags())
        channel.add_child(book_topic)

        # Split the book into per-chapter PDFs using the table of contents
        # supplied in the JSON entry.
        parser = pdf.PDFParser(url, toc=book['chapters'])
        parser.open()
        for chapter in parser.split_chapters():
            chapter_title = chapter['title']
            chapter_file = files.DocumentFile(chapter['path'])
            chapter_node = nodes.DocumentNode(
                source_id="{} {}".format(book_title, chapter_title),
                title=chapter_title,
                author="INTO",
                tags=fresh_tags(),
                files=[chapter_file],
                license=licenses.get_license(CHANNEL_LICENSE, "INTO",
                                             LICENSE_DESCRIPTION),
                copyright_holder="INTO")
            book_topic.add_child(chapter_node)

    # Check for errors in channel construction before handing it back.
    raise_for_invalid_channel(channel)
    return channel
def add_file_node(target_node, url, title, **details):
    """Create a document node for `url` and attach it to `target_node`.

    The new node's source id is the target node's `source_id`, a hyphen, and
    a slug of `title` (spaces replaced by hyphens, then lower-cased). Extra
    keyword arguments in `details` are forwarded to `nodes.DocumentNode`.
    """
    attachment = files.DocumentFile(path=url)
    slug = title.replace(" ", "-").lower()
    node_source_id = "{}-{}".format(target_node.source_id, slug)
    target_node.add_child(
        nodes.DocumentNode(source_id=node_source_id,
                           title=title,
                           files=[attachment],
                           **details))
def save_book(book_detail, channel):
    """Create a document node for one book and file it under its topics.

    Reads id, name, reading level, language, tags, EPUB URL and PDF URLs from
    `book_detail`. The PDF URL is the first non-empty of portrait, landscape
    and booklet. The book is attached under language -> level, and under each
    tag topic when the book has tags.

    Raises NoFileAvailableError when neither a PDF nor an EPUB URL is present.
    """
    book_id = book_detail["id"]
    book_source_id = get_book_source_id(book_id)
    book_title = book_detail["name"]
    level_id = book_detail["readingLevel"]
    language = book_detail["language"]
    language_id = language["id"]
    tags = book_detail["tags"]
    epub_url = book_detail["epubUrl"]
    pdf_sources = book_detail["pdfUrl"] or {}

    # Preference order: portrait, then landscape, then booklet.
    pdf_url = ""
    for url_key in ("portraitUrl", "landscapeUrl", "bookletUrl"):
        pdf_url = pdf_sources.get(url_key, "")
        if pdf_url:
            break

    if not (pdf_url or epub_url):
        LOGGER.error("No file found for \n {}".format(book_source_id))
        raise NoFileAvailableError()

    book_files = []
    if pdf_url:
        book_files.append(files.DocumentFile(path=pdf_url))
    if epub_url:
        book_files.append(files.EPubFile(path=epub_url))

    book = nodes.DocumentNode(
        source_id=book_source_id,
        title=book_title,
        license=licenses.PUBLIC_DOMAIN,  # TODO: get a real license and copyright holder
        files=book_files)

    language_topic = get_or_create_language_topic(language, channel)
    level_topic = get_or_create_level_topic(level_id, language_id,
                                            language_topic)

    if tags:
        # NOTE(review): the same node object is attached to every tag topic;
        # confirm that sharing one node between several parents is intended.
        for tag in tags:
            get_or_create_tag_topic(tag, language_id, level_id,
                                    level_topic).add_child(book)
    else:
        level_topic.add_child(book)
def generate_pdf_nodes(data, topic, source=""):
    """Populate `topic` with the sections, documents and exercises in `data`.

    Args:
      - data (list of dict): entries with either a 'header' key (a section
        whose 'chapters' are processed recursively) or a 'chapter' key (a
        document with a 'path' and optional 'exercises')
      - topic (TopicNode): node that receives the generated children
      - source (str): prefix used when building source ids
    Returns None
    """
    for entry in data:
        section_title = entry.get('header')
        if section_title:
            # A section: create a sub-topic and descend into its chapters.
            section_id = "{}-{}".format(source, section_title)
            section_node = nodes.TopicNode(title=section_title,
                                           source_id=section_id)
            topic.add_child(section_node)
            generate_pdf_nodes(entry['chapters'], section_node,
                               source=section_id)
            continue

        chapter_title = entry.get("chapter")
        if not chapter_title:
            # Neither a section nor a document: nothing to add.
            continue

        # A document, followed by one exercise node per listed exercise.
        document_id = "{}-{}".format(source, chapter_title)
        document_node = nodes.DocumentNode(
            title=chapter_title,
            source_id=document_id,
            copyright_holder=COPYRIGHT_HOLDER,
            license=LICENSE,
            files=[files.DocumentFile(entry['path'])])
        topic.add_child(document_node)

        exercises = entry.get("exercises") or []
        for position, exercise in enumerate(exercises):
            exercise_node = nodes.ExerciseNode(
                title=chapter_title,
                source_id="{} Exercise {}".format(document_id, position),
                description=exercise.get('description'),
                copyright_holder=COPYRIGHT_HOLDER,
                license=LICENSE,
            )
            topic.add_child(exercise_node)
            create_exercise_questions(exercise_node,
                                      exercise.get('questions') or [])
def create_slideshow(images, source_id, title, language_name):
    """Return a node presenting `images` as a slideshow or as a PDF.

    images: list of {url: str, caption: str}; the first image is also used
    as the thumbnail.

    When '--slides' is present in `sys.argv` a `SlideshowNode` is returned.
    Otherwise the images are combined into a single PDF, named after the MD5
    hex digest of `source_id` inside DOCUMENT_DOWNLOAD_DIR, and a
    `DocumentNode` is returned. An existing PDF at that path is reused.
    """
    thumbnail = files.ThumbnailFile(images[0]['url'])
    language = LANGUAGE_MAP[language_name]

    if '--slides' in sys.argv:
        node_files = [thumbnail]
        for image in images:
            node_files.append(
                files.SlideImageFile(image['url'],
                                     caption=image.get('caption', '')))
        return nodes.SlideshowNode(source_id=source_id,
                                   title=title,
                                   license=LICENSE,
                                   language=language,
                                   files=node_files)

    # Create PDF
    digest = hashlib.md5(source_id.encode('utf-8')).hexdigest()
    pdfpath = '{}{}{}.pdf'.format(DOCUMENT_DOWNLOAD_DIR, os.path.sep, digest)
    if not os.path.exists(pdfpath):
        pages = []
        for image in images:
            page = Image.open(BytesIO(downloader.read(image['url'])))
            # RGBA pages are converted to RGB before being written to the PDF.
            if page.mode == 'RGBA':
                page = page.convert('RGB')
            pages.append(page)
        first_page, remaining_pages = pages[0], pages[1:]
        first_page.save(pdfpath, save_all=True, append_images=remaining_pages)

    return nodes.DocumentNode(
        source_id=source_id,
        title=title,
        license=LICENSE,
        language=language,
        files=[thumbnail, files.DocumentFile(pdfpath)])
def make_content_node(kind, source_id, title, license, filepath, optionals):
    """Build the content node matching `kind` for the file at `filepath`.

    Supports video, audio and document kinds; any other kind yields None.
    `optionals` supplies the optional "author", "description" and (for audio
    and documents) "thumbnail" values. Video nodes are created with
    `derive_thumbnail=True` instead of an explicit thumbnail.
    """
    if kind == content_kinds.VIDEO:
        node_class, file_class, has_thumbnail = nodes.VideoNode, files.VideoFile, False
    elif kind == content_kinds.AUDIO:
        node_class, file_class, has_thumbnail = nodes.AudioNode, files.AudioFile, True
    elif kind == content_kinds.DOCUMENT:
        node_class, file_class, has_thumbnail = nodes.DocumentNode, files.DocumentFile, True
    else:
        return None

    node_kwargs = dict(
        source_id=source_id,
        title=title,
        license=license,
        author=optionals.get("author", None),
        description=optionals.get("description", None),
    )
    if has_thumbnail:
        node_kwargs["thumbnail"] = optionals.get("thumbnail", None)
    else:
        # video-specific data
        node_kwargs["derive_thumbnail"] = True
    node_kwargs["files"] = [file_class(path=filepath)]

    return node_class(**node_kwargs)
def create_leaf_node(self, module, subject_node, subject_id):
    """Add the content described by `module` under `subject_node`.

    A '.zip' file is handed to `self.get_scorm_topic_tree` (zips are always
    SCORMs in this case); a '.pdf' file becomes a document node whose source
    id is '<subject_id>-<module id>'. Afterwards the module's role
    (defaulting to `roles.LEARNER`) is written to `subject_node` and to every
    node below it.
    """
    assert 'file' in module, "Invalid module: {}".format(module)

    if 'file' in module:
        file_path = module['file']
        extension = os.path.splitext(file_path)[1].lower()
        if extension == '.zip':
            self.get_scorm_topic_tree(subject_node, file_path)
        elif extension == '.pdf':
            pdf_license = licenses.SpecialPermissionsLicense(
                copyright_holder="ProFuturo",
                description="FIXME: Get license info")
            pdf_source_id = '{}-{}'.format(subject_id, module['id'])
            pdf_file = files.DocumentFile(path=module['file'])
            subject_node.add_child(
                nodes.DocumentNode(title=module['title'],
                                   source_id=pdf_source_id,
                                   files=[pdf_file],
                                   license=pdf_license))

    role = module.get('role', roles.LEARNER)

    # Walk the whole subtree rooted at subject_node and stamp the role on
    # every node, using an explicit stack instead of recursion.
    pending = [subject_node]
    while pending:
        current = pending.pop()
        current.role = role
        pending.extend(current.children)
def _build_tree(node, sourcetree):
    """
    Parse nodes given in `sourcetree` and add as children of `node`.

    Each entry of `sourcetree` is a dict. Its kind is guessed from the first
    entry of its "files" list (path, youtube id or web url) and from its
    "questions". Entries whose kind cannot be determined, or is not one of
    topic / video / audio / document / exercise / html5, are skipped.
    Topic entries are processed recursively through their "children".

    Returns `node`.
    """
    # Leaf kinds that are built from exactly the same fields; only the node
    # class differs.
    simple_leaf_classes = {
        content_kinds.AUDIO: nodes.AudioNode,
        content_kinds.DOCUMENT: nodes.DocumentNode,
        content_kinds.HTML5: nodes.HTML5AppNode,
    }

    def common_fields(source):
        # Fields shared by every node kind handled below.
        return dict(
            source_id=source["id"],
            title=source["title"],
            author=source.get("author"),
            description=source.get("description"),
            thumbnail=source.get("thumbnail"),
        )

    for child_source_node in sourcetree:
        # A present-but-empty (or None) "files" value must not raise: treat
        # it the same as a missing key.
        source_files = child_source_node.get("files") or []
        main_file = source_files[0] if source_files else {}
        try:
            kind = guess_content_kind(
                path=main_file.get('path'),
                web_video_data=main_file.get('youtube_id') or main_file.get('web_url'),
                questions=child_source_node.get("questions"))
        except UnknownContentKindError:
            continue

        if kind == content_kinds.TOPIC:
            child_node = nodes.TopicNode(**common_fields(child_source_node))
            node.add_child(child_node)
            _build_tree(child_node, child_source_node.get("children", []))
            continue

        if kind == content_kinds.VIDEO:
            child_node = nodes.VideoNode(
                license=get_license(child_source_node.get("license"),
                                    description="Description of license"),
                derive_thumbnail=True,  # video-specific data
                **common_fields(child_source_node))
        elif kind == content_kinds.EXERCISE:
            child_node = nodes.ExerciseNode(
                license=child_source_node.get("license"),
                exercise_data={},  # Just set to default
                **common_fields(child_source_node))
        elif kind in simple_leaf_classes:
            child_node = simple_leaf_classes[kind](
                license=child_source_node.get("license"),
                **common_fields(child_source_node))
        else:
            # unknown content file format
            continue

        add_files(child_node, source_files)
        if kind == content_kinds.EXERCISE:
            for question_data in child_source_node.get("questions") or []:
                child_node.add_question(create_question(question_data))
        node.add_child(child_node)

    return node
def build_tree_from_json(parent_node, sourcetree):
    """
    Recursively parse nodes in the list `sourcetree` and add them as children
    to the `parent_node`. Usually called with `parent_node` being a `ChannelNode`.

    Each entry of `sourcetree` is a dict with a 'kind' key. Topic entries are
    processed recursively through their 'children'; exercise entries get
    their 'questions' added; every other kind gets its 'files' added.

    Returns `parent_node`.
    Raises NotImplementedError when an entry has an unexpected kind.
    """
    EXPECTED_NODE_TYPES = [
        TOPIC_NODE, VIDEO_NODE, AUDIO_NODE, EXERCISE_NODE, DOCUMENT_NODE,
        HTML5_NODE
    ]
    # Kinds whose content is attached through `add_files`.
    file_node_classes = {
        VIDEO_NODE: nodes.VideoNode,
        AUDIO_NODE: nodes.AudioNode,
        DOCUMENT_NODE: nodes.DocumentNode,
        HTML5_NODE: nodes.HTML5AppNode,
    }

    for source_node in sourcetree:
        kind = source_node['kind']
        if kind not in EXPECTED_NODE_TYPES:
            # %-style logging args: string concatenation would raise a
            # TypeError for a non-string kind and hide the real error.
            LOGGER.critical('Unexpected node type found: %s', kind)
            raise NotImplementedError(
                'Unexpected node type found in json data.')

        if kind == TOPIC_NODE:
            child_node = nodes.TopicNode(
                source_id=source_node.get("source_id", None),
                title=source_node["title"],
                author=source_node.get("author"),
                description=source_node.get("description"),
                language=source_node.get('language', None),
                thumbnail=source_node.get("thumbnail"),
            )
            parent_node.add_child(child_node)
            build_tree_from_json(child_node, source_node.get("children", []))
            continue

        # Fields shared by every non-topic kind.
        fields = dict(
            source_id=source_node["source_id"],
            title=source_node["title"],
            license=get_license(**source_node['license']),
            author=source_node.get("author"),
            description=source_node.get("description"),
            language=source_node.get('language', None),
            thumbnail=source_node.get("thumbnail"),
        )

        if kind == EXERCISE_NODE:
            child_node = nodes.ExerciseNode(questions=[], **fields)
            add_questions(child_node, source_node.get("questions") or [])
        else:
            if kind == VIDEO_NODE:
                # video-specific option
                fields['derive_thumbnail'] = source_node.get(
                    'derive_thumbnail', True)
            child_node = file_node_classes[kind](**fields)
            add_files(child_node, source_node.get("files") or [])

        parent_node.add_child(child_node)

    return parent_node
def build_tree_from_json(parent_node, sourcetree):
    """
    Recursively parse nodes in the list `sourcetree` and add them as children
    to the `parent_node`. Usually called with `parent_node` being a `ChannelNode`.

    Each entry of `sourcetree` is a dict with a 'kind' key. Topic entries are
    processed recursively through their 'children'; exercise entries get
    their 'questions' added; every other kind (video, audio, document, html5,
    slideshow) gets its 'files' added.

    Returns `parent_node`.
    Raises NotImplementedError when an entry has an unexpected kind.
    """
    EXPECTED_NODE_TYPES = [TOPIC_NODE, VIDEO_NODE, AUDIO_NODE, EXERCISE_NODE,
                           DOCUMENT_NODE, HTML5_NODE, SLIDESHOW_NODE]
    # Kinds whose content is attached through `add_files`.
    file_node_classes = {
        VIDEO_NODE: nodes.VideoNode,
        AUDIO_NODE: nodes.AudioNode,
        DOCUMENT_NODE: nodes.DocumentNode,
        HTML5_NODE: nodes.HTML5AppNode,
        SLIDESHOW_NODE: nodes.SlideshowNode,
    }

    for source_node in sourcetree:
        kind = source_node['kind']
        if kind not in EXPECTED_NODE_TYPES:
            # %-style logging args: string concatenation would raise a
            # TypeError for a non-string kind and hide the real error.
            LOGGER.critical('Unexpected node kind found: %s', kind)
            raise NotImplementedError('Unexpected node kind found in json data.')

        # Metadata read the same way for every kind, topics included.
        fields = dict(
            title=source_node['title'],
            description=source_node.get('description'),
            author=source_node.get('author'),
            aggregator=source_node.get('aggregator'),
            provider=source_node.get('provider'),
            language=source_node.get('language'),
            thumbnail=source_node.get('thumbnail'),
            tags=source_node.get('tags'),
        )

        if kind == TOPIC_NODE:
            # no role for topics (computed dynamically from descendants)
            child_node = nodes.TopicNode(
                source_id=source_node.get('source_id', None), **fields)
            parent_node.add_child(child_node)
            build_tree_from_json(child_node, source_node.get('children', []))
            continue

        # Content nodes require a source id and a license, and carry a role.
        fields['source_id'] = source_node['source_id']
        fields['license'] = get_license(**source_node['license'])
        fields['role'] = source_node.get('role', roles.LEARNER)

        if kind == EXERCISE_NODE:
            child_node = nodes.ExerciseNode(
                exercise_data=source_node.get('exercise_data'),
                questions=[],
                **fields)
            add_questions(child_node, source_node.get('questions') or [])
        else:
            if kind == VIDEO_NODE:
                # video-specific option
                fields['derive_thumbnail'] = source_node.get('derive_thumbnail', True)
            child_node = file_node_classes[kind](**fields)
            add_files(child_node, source_node.get('files') or [])

        parent_node.add_child(child_node)

    return parent_node
def _scrape_book_entry(book, category_topic, source_id):
    """Add the node for one `<li>` book entry to `category_topic`."""
    book_license = LICENSE
    small = book.find('small')

    # Most book links go straight to an html page
    if not small:
        link = book.find('a')
        if link is None or not link.get('href'):
            LOGGER.warning("Skipping book entry without a link")
            return
        html_node = scrape_book(link['href'], book_license)
        if html_node:
            category_topic.add_child(html_node)
        return

    # Some books have subsections for different formats/licenses
    # e.g. See Business-General/Miscellaneous > Information Systems for Business and Beyond
    # Determine what license to use: first choice whose id appears in the text.
    small_text = small.text
    for choice in licenses.choices:
        if choice[0] in small_text:
            book_license = choice[0]
            break

    booktitle = book.contents[0]
    LOGGER.info(" " + booktitle)

    # Download one of the sublinks
    for sublink in book.find_all('a'):
        href = sublink.get('href')
        if not href:
            continue
        if "PDF" in sublink.text:
            category_topic.add_child(
                nodes.DocumentNode(
                    source_id=source_id + os.path.basename(href),
                    title=booktitle,
                    license=book_license,
                    copyright_holder=COPYRIGHT_HOLDER,
                    files=[files.DocumentFile(path=href)]))
            break  # only need to download one format of the book
        if "HTML" in sublink.text:
            html_node = scrape_book(href, license=book_license)
            if html_node:
                category_topic.add_child(html_node)
            break  # only need to download one format of the book


def scrape_page(channel):
    """ Read main page for Saylor (https://www.saylor.org/books/)

    Creates one topic per `<h3>` subject heading found in the main content
    row and adds a node for each book listed in the `<ul>` that follows the
    heading. VIDEO_MAPPING is written to VIDEO_MAP_JSON on the way out,
    whether or not scraping succeeded.
    """
    try:
        page = BeautifulSoup(read_source(BASE_URL, loadjs=True), 'html.parser')
        # The second lookup takes an attribute dict, not a set literal.
        contents = page.find('div', {
            'class': 'main-content'
        }).find('div', {'class': 'row'})

        # Site doesn't have special designation for subjects, so get headers
        for subject in contents.find_all('h3'):
            # Create subject topic
            title = subject.text.replace(u'\xa0', u' ').replace('\n', '')
            source_id = generate_id(title)
            category_topic = nodes.TopicNode(source_id=source_id, title=title)
            channel.add_child(category_topic)
            LOGGER.info(title)

            # Get list from subject; a heading with no list has no books.
            book_list = subject.findNext('ul')
            if book_list is None:
                continue
            for book in book_list.find_all('li'):
                _scrape_book_entry(book, category_topic, source_id)
    finally:
        # No matter what, add link to video mapping for future runs
        with open(VIDEO_MAP_JSON, "w") as videojson:
            json.dump(VIDEO_MAPPING, videojson)