예제 #1
0
def add_files(node, file_list):
    """
    Create a file object for every entry of `file_list` and attach it to `node`.

    The kind of file object is chosen from the value returned by
    `guess_file_type`, which is given `node.kind` plus the entry's path,
    youtube_id, web_url and encoding.

    Args:
      - node: content node; `node.kind` is read and `node.add_file` is called.
      - file_list: iterable of dicts. Keys read here: 'path', 'youtube_id',
        'web_url', 'encoding', 'language', 'ffmpeg_settings', 'high_resolution'.
    Raises:
      - UnknownFileTypeError when the guessed type has no branch below.
      - KeyError when a key that a branch indexes directly is missing
        ('language' for subtitles, 'encoding', 'web_url', 'youtube_id').
    """
    for f in file_list:

        path = f.get('path')
        if path is not None:
            abspath = get_abspath(path)      # expand  content://  -->  ./content/  in file paths
        else:
            abspath = None

        file_type = guess_file_type(node.kind, filepath=abspath, youtube_id=f.get('youtube_id'), web_url=f.get('web_url'), encoding=f.get('encoding'))

        if file_type == FileTypes.AUDIO_FILE:
            node.add_file(files.AudioFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.THUMBNAIL:
            node.add_file(files.ThumbnailFile(path=abspath))
        elif file_type == FileTypes.DOCUMENT_FILE:
            node.add_file(files.DocumentFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.HTML_ZIP_FILE:
            node.add_file(files.HTMLZipFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.VIDEO_FILE:
            node.add_file(files.VideoFile(path=abspath, language=f.get('language'), ffmpeg_settings=f.get('ffmpeg_settings')))
        elif file_type == FileTypes.SUBTITLE_FILE:
            node.add_file(files.SubtitleFile(path=abspath, language=f['language']))
        elif file_type == FileTypes.BASE64_FILE:
            node.add_file(files.Base64ImageFile(encoding=f['encoding']))
        elif file_type == FileTypes.WEB_VIDEO_FILE:
            node.add_file(files.WebVideoFile(web_url=f['web_url'], high_resolution=f.get('high_resolution')))
        elif file_type == FileTypes.YOUTUBE_VIDEO_FILE:
            node.add_file(files.YouTubeVideoFile(youtube_id=f['youtube_id'], high_resolution=f.get('high_resolution')))
            # English subtitles are always requested alongside a YouTube video.
            node.add_file(files.YouTubeSubtitleFile(youtube_id=f['youtube_id'], language='en'))
        else:
            # Entries are allowed to have no 'path' key (see f.get('path') above),
            # so use the value already fetched: indexing f['path'] here would
            # raise KeyError and hide the real problem.
            raise UnknownFileTypeError("Unrecognized file type '{0}'".format(path))
예제 #2
0
def create_document_node(path, title, target_node, source_id, **details):
    """
    Build a DocumentNode for the file at `path` and add it under `target_node`.

    The node's source_id is `source_id`, a dash, and `title` with spaces
    replaced by dashes and lowercased. Extra keyword arguments in `details`
    are forwarded to `nodes.DocumentNode`.
    """
    attachment = files.DocumentFile(path)
    slug = title.replace(" ", "-").lower()
    node_source_id = "{}-{}".format(source_id, slug)
    new_node = nodes.DocumentNode(
        source_id=node_source_id,
        title=title,
        files=[attachment],
        **details
    )
    target_node.add_child(new_node)
예제 #3
0
    def construct_channel(self, *args, **kwargs):
        """
        Create the ChannelNode and build its topic tree: one topic per book
        listed in JSON_FILE, holding one PDF document node per chapter.
        Args:
          - args: arguments passed in during upload_channel (currently None)
          - kwargs: extra arguments and options not handled by `uploadchannel`.
            For example, add the command line option   lang="fr"  and the string
            "fr" will be passed along to `construct_channel` as kwargs['lang'].
        Returns: ChannelNode
        """

        def make_tags():
            # Return a new list on every call so no two nodes share one object.
            return [
                "Teacher facing",
                "Professional development",
                "Life skills",
                "Intercultural skills",
                "Mentorship",
                "Formal contexts",
            ]

        # Create ChannelNode from data in self.channel_info
        channel = self.get_channel(*args, **kwargs)

        for book in load_json_from_file(JSON_FILE):
            book_title = book['book_title']
            book_source_id = book_title.replace(" ", "_")
            location = book['path_or_url']
            book_topic = nodes.TopicNode(
                source_id=book_source_id,
                title=book_title,
                tags=make_tags(),
            )
            channel.add_child(book_topic)

            # Split the book into one PDF per table-of-contents entry
            splitter = pdf.PDFParser(location, toc=book['chapters'])
            splitter.open()
            for chapter in splitter.split_chapters():
                chapter_title = chapter['title']
                chapter_file = files.DocumentFile(chapter['path'])
                chapter_node = nodes.DocumentNode(
                    source_id="{} {}".format(book_title, chapter_title),
                    title=chapter_title,
                    author="INTO",
                    tags=make_tags(),
                    files=[chapter_file],
                    license=licenses.get_license(
                        CHANNEL_LICENSE, "INTO", LICENSE_DESCRIPTION),
                    copyright_holder="INTO",
                )
                book_topic.add_child(chapter_node)

        # Check for errors in channel construction
        raise_for_invalid_channel(channel)

        return channel
def add_file_node(target_node, url, title, **details):
    """
    Create a document node for `url` and add it as a child of `target_node`.

    The new node's source_id is the target node's source_id, a dash, and
    `title` with spaces replaced by dashes and lowercased. Extra keyword
    arguments in `details` are forwarded to `nodes.DocumentNode`.
    """
    attachment = files.DocumentFile(path=url)
    slug = title.replace(" ", "-").lower()
    node_source_id = "{}-{}".format(target_node.source_id, slug)
    target_node.add_child(
        nodes.DocumentNode(
            source_id=node_source_id,
            title=title,
            files=[attachment],
            **details
        )
    )
def save_book(book_detail, channel):
    """
    Create a DocumentNode for one book and file it in the channel tree.

    The book is placed under language topic -> reading-level topic, and then
    under one topic per tag; a book with no tags is added to the level topic
    directly.

    Args:
        book_detail: dict with the keys "id", "name", "readingLevel",
            "language" (itself holding "id"), "tags", "epubUrl" and "pdfUrl".
        channel: passed to `get_or_create_language_topic`.
    Raises:
        NoFileAvailableError: when the book has neither a PDF nor an EPUB URL.
    """
    source_id = get_book_source_id(book_detail["id"])
    title = book_detail["name"]
    level_id = book_detail["readingLevel"]
    language = book_detail["language"]
    language_id = language["id"]
    tags = book_detail["tags"]
    epub_url = book_detail["epubUrl"]
    pdf_urls = book_detail["pdfUrl"]

    # Pick the first available PDF layout: portrait, then landscape, then booklet.
    pdf_url = ""
    if pdf_urls:
        pdf_url = (pdf_urls.get("portraitUrl", "")
                   or pdf_urls.get("landscapeUrl", "")
                   or pdf_urls.get("bookletUrl", ""))

    if not (pdf_url or epub_url):
        LOGGER.error("No file found for \n {}".format(source_id))
        raise NoFileAvailableError()

    attachments = []
    if pdf_url:
        attachments.append(files.DocumentFile(path=pdf_url))
    if epub_url:
        attachments.append(files.EPubFile(path=epub_url))

    # TODO: get a real license and copyright holder
    book = nodes.DocumentNode(
        source_id=source_id,
        title=title,
        license=licenses.PUBLIC_DOMAIN,
        files=attachments,
    )

    language_topic = get_or_create_language_topic(language, channel)
    level_topic = get_or_create_level_topic(
        level_id, language_id, language_topic)

    if tags:
        for tag in tags:
            parent = get_or_create_tag_topic(
                tag, language_id, level_id, level_topic)
            parent.add_child(book)
    else:
        level_topic.add_child(book)
예제 #6
0
def generate_pdf_nodes(data, topic, source=""):
    """
        Walk the pdf data and add the matching nodes under `topic`
        Args:
            - data (iterable of dict) entries with either a truthy 'header'
              (a section, with nested 'chapters') or a truthy 'chapter'
              (a document, with 'path' and optional 'exercises')
            - topic (TopicNode) node to add sub nodes to
            - source (str) unique string associated with this pdf; used as the
              prefix of every generated source_id
        Returns None
    """
    for entry in data:
        section_title = entry.get('header')
        if section_title:
            # Section: create a subtopic and recurse into its chapters
            section_id = "{}-{}".format(source, section_title)
            section_node = nodes.TopicNode(title=section_title,
                                           source_id=section_id)
            topic.add_child(section_node)
            generate_pdf_nodes(entry['chapters'], section_node,
                               source=section_id)
            continue

        chapter_title = entry.get("chapter")
        if not chapter_title:
            # Neither a section nor a document: nothing to create
            continue

        # Document: create the document node
        document_id = "{}-{}".format(source, chapter_title)
        document_file = files.DocumentFile(entry['path'])
        topic.add_child(
            nodes.DocumentNode(title=chapter_title,
                               source_id=document_id,
                               copyright_holder=COPYRIGHT_HOLDER,
                               license=LICENSE,
                               files=[document_file]))

        # ...followed by its exercise nodes, as siblings of the document
        exercises = entry.get("exercises") or []
        for position, exercise in enumerate(exercises):
            exercise_node = nodes.ExerciseNode(
                title=chapter_title,
                source_id="{} Exercise {}".format(document_id, position),
                description=exercise.get('description'),
                copyright_holder=COPYRIGHT_HOLDER,
                license=LICENSE,
            )
            topic.add_child(exercise_node)
            create_exercise_questions(exercise_node,
                                      exercise.get('questions') or [])
예제 #7
0
def create_slideshow(images, source_id, title, language_name):
    """
        Build a node presenting `images`, using the first image as thumbnail.

        With '--slides' on the command line a SlideshowNode is returned;
        otherwise the images are combined into one PDF (reused if the file
        already exists) and a DocumentNode is returned.

        images: list of {url: str, caption: str}
    """
    thumbnail = files.ThumbnailFile(images[0]['url'])

    if '--slides' in sys.argv:
        slide_files = []
        for image in images:
            slide_files.append(
                files.SlideImageFile(image['url'],
                                     caption=image.get('caption', '')))
        return nodes.SlideshowNode(source_id=source_id,
                                   title=title,
                                   license=LICENSE,
                                   language=LANGUAGE_MAP[language_name],
                                   files=[thumbnail] + slide_files)

    def load_page(url):
        # RGBA images are converted to RGB before being written to the PDF
        picture = Image.open(BytesIO(downloader.read(url)))
        return picture.convert('RGB') if picture.mode == 'RGBA' else picture

    # The PDF file name is the md5 hex digest of the source_id
    digest = hashlib.md5(source_id.encode('utf-8')).hexdigest()
    pdfpath = '{}{}{}.pdf'.format(DOCUMENT_DOWNLOAD_DIR, os.path.sep, digest)

    if not os.path.exists(pdfpath):
        pages = [load_page(image['url']) for image in images]
        first_page, remaining_pages = pages[0], pages[1:]
        first_page.save(pdfpath, save_all=True, append_images=remaining_pages)

    document = files.DocumentFile(pdfpath)
    return nodes.DocumentNode(source_id=source_id,
                              title=title,
                              license=LICENSE,
                              language=LANGUAGE_MAP[language_name],
                              files=[thumbnail, document])
def make_content_node(kind, source_id, title, license, filepath, optionals):
    """
    Build the content node matching `kind` from required args and optionals.

    Video, audio and document kinds are supported; any other kind returns None.
    `optionals` is read with `.get` for "author" and "description", plus
    "thumbnail" for audio and document nodes.
    """

    def shared_fields():
        # Fields common to all three node classes.
        return {
            "source_id": source_id,
            "title": title,
            "license": license,
            "author": optionals.get("author", None),
            "description": optionals.get("description", None),
        }

    if kind == content_kinds.VIDEO:
        fields = shared_fields()
        fields["derive_thumbnail"] = True  # video-specific data
        fields["files"] = [files.VideoFile(path=filepath)]
        return nodes.VideoNode(**fields)

    if kind == content_kinds.AUDIO:
        fields = shared_fields()
        fields["thumbnail"] = optionals.get("thumbnail", None)
        fields["files"] = [files.AudioFile(path=filepath)]
        return nodes.AudioNode(**fields)

    if kind == content_kinds.DOCUMENT:
        fields = shared_fields()
        fields["thumbnail"] = optionals.get("thumbnail", None)
        fields["files"] = [files.DocumentFile(path=filepath)]
        return nodes.DocumentNode(**fields)

    return None
예제 #9
0
def add_files(node, file_list):
    """
    Attach every entry of `file_list` to `node`, choosing the file class from
    the type returned by `parse_file_name`.

    Handled types: 'mp3' (AudioFile), 'pdf' (DocumentFile), 'mp4' (VideoFile).
    Each entry is passed as-is to `parse_file_name` and as the `path` of the
    file object, and is printed before being processed.

    Raises:
        UnknownFileTypeError: for any other file type.
    """
    for f in file_list:
        _, file_type = parse_file_name(f)
        print(f)
        if file_type == 'mp3':
            node.add_file(files.AudioFile(path=f))
        elif file_type == 'pdf':
            node.add_file(files.DocumentFile(path=f))
        elif file_type == 'mp4':
            node.add_file(files.VideoFile(path=f))
        else:
            # `f` is the path itself, not a dict: f['path'] would raise
            # TypeError here instead of the intended UnknownFileTypeError.
            raise UnknownFileTypeError("Unrecognised file type '{0}'".format(f))
예제 #10
0
    def create_leaf_node(self, module, subject_node, subject_id):
        """
        Add the content described by `module` under `subject_node`.

        A '.zip' file is handed to `self.get_scorm_topic_tree`; a '.pdf' file
        becomes a DocumentNode whose source_id is `subject_id` and the module
        id joined by a dash. Afterwards the module's role (roles.LEARNER when
        the module has no 'role') is set on `subject_node` and on every node
        below it.
        """
        # zips are always SCORMs in this case.
        assert 'file' in module, "Invalid module: {}".format(module)
        if 'file' not in module:
            # Only reachable when assertions are disabled
            return

        extension = os.path.splitext(module['file'])[1].lower()
        if extension == '.zip':
            self.get_scorm_topic_tree(subject_node, module['file'])
        elif extension == '.pdf':
            pdf_license = licenses.SpecialPermissionsLicense(
                copyright_holder="ProFuturo",
                description="FIXME: Get license info")
            pdf_source_id = '{}-{}'.format(subject_id, module['id'])
            pdf_file = files.DocumentFile(path=module['file'])
            subject_node.add_child(
                nodes.DocumentNode(title=module['title'],
                                   source_id=pdf_source_id,
                                   files=[pdf_file],
                                   license=pdf_license))

        default_role = roles.LEARNER
        chosen_role = module['role'] if 'role' in module else default_role

        def apply_role(node, new_role):
            # Depth-first: set the role on the node, then on its descendants
            node.role = new_role
            for descendant in node.children:
                apply_role(descendant, new_role)

        apply_role(subject_node, chosen_role)
예제 #11
0
def add_files(node, file_list):
    """
    Create a file object for every entry of `file_list` and attach it to `node`.

    Each entry is a dict whose 'file_type' selects the file class:
      - VIDEO_FILE: YouTubeVideoFile if 'youtube_id' is present, else
        WebVideoFile if 'web_url' is present, else VideoFile from 'path'.
      - AUDIO_FILE, DOCUMENT_FILE, HTML5_FILE: built from 'path'.
      - THUMBNAIL_FILE: Base64ImageFile if 'encoding' is present, else
        ThumbnailFile from 'path'.
      - SUBTITLES_FILE: YouTubeSubtitleFile if 'youtube_id' is present, else
        SubtitleFile from 'path'; 'language' is required in both cases.

    Raises:
        NotImplementedError: if 'file_type' is not one of the expected types.
        KeyError: if a key that a branch indexes directly is missing.
    """
    EXPECTED_FILE_TYPES = [
        VIDEO_FILE, AUDIO_FILE, DOCUMENT_FILE, HTML5_FILE, THUMBNAIL_FILE,
        SUBTITLES_FILE
    ]

    for f in file_list:
        file_type = f.get('file_type')
        if file_type not in EXPECTED_FILE_TYPES:
            LOGGER.critical(file_type)
            raise NotImplementedError(
                'Unexpected File type found in channel json.')

        path = f.get('path')  # path can be an URL or a local path (or None)

        # handle different types of files
        if file_type == VIDEO_FILE:
            # handle three types of video files
            if 'youtube_id' in f:
                video_file = files.YouTubeVideoFile(
                    youtube_id=f['youtube_id'],
                    download_settings=f.get('download_settings', None),
                    high_resolution=f.get('high_resolution', True),
                    maxheight=f.get('maxheight', None),
                    language=f.get('language', None),
                )
            elif 'web_url' in f:
                video_file = files.WebVideoFile(
                    web_url=f['web_url'],
                    download_settings=f.get('download_settings', None),
                    high_resolution=f.get('high_resolution', True),
                    maxheight=f.get('maxheight', None),
                    language=f.get('language', None),
                )
            else:
                # A plain video file must have a 'path' (KeyError otherwise)
                video_file = files.VideoFile(
                    path=f['path'],
                    language=f.get('language', None),
                    ffmpeg_settings=f.get('ffmpeg_settings'),
                )
            node.add_file(video_file)

        elif file_type == AUDIO_FILE:
            # An audio file must have a 'path' (KeyError otherwise)
            node.add_file(
                files.AudioFile(path=f['path'],
                                language=f.get('language', None)))

        elif file_type == DOCUMENT_FILE:
            node.add_file(
                files.DocumentFile(path=path, language=f.get('language',
                                                             None)))

        elif file_type == HTML5_FILE:
            node.add_file(
                files.HTMLZipFile(path=path, language=f.get('language', None)))

        elif file_type == THUMBNAIL_FILE:
            if 'encoding' in f:
                node.add_file(files.Base64ImageFile(encoding=f['encoding'], ))
            else:
                node.add_file(
                    files.ThumbnailFile(
                        path=path,
                        language=f.get('language', None),
                    ))

        elif file_type == SUBTITLES_FILE:
            if 'youtube_id' in f:
                node.add_file(
                    files.YouTubeSubtitleFile(youtube_id=f['youtube_id'],
                                              language=f['language']))
            else:
                node.add_file(
                    files.SubtitleFile(path=path, language=f['language']))

        else:
            # Not reachable while every type in EXPECTED_FILE_TYPES has a
            # branch above; kept as a guard for a type added to that list
            # without a handler. Uses the already-fetched `path` (may be None)
            # so the guard itself cannot fail with KeyError.
            raise UnknownFileTypeError("Unrecognized file type '{0}'".format(
                path))
def scrape_page(channel):
    """
    Read main page for Saylor (https://www.saylor.org/books/) and add one
    topic per subject heading to `channel`, each holding that subject's books.

    A book is added either as a DocumentNode (PDF link) or as the node
    returned by `scrape_book` (HTML link); books for which `scrape_book`
    returns nothing are skipped. Whatever happens, VIDEO_MAPPING is written to
    VIDEO_MAP_JSON before returning or propagating an exception.
    """
    try:
        page = BeautifulSoup(read_source(BASE_URL, loadjs=True), 'html.parser')
        # attrs must be a dict: the set {'class', 'row'} only worked because
        # bs4 treats a non-dict as a list of class names to match.
        contents = page.find('div', {
            'class': 'main-content'
        }).find('div', {'class': 'row'})

        # Site doesn't have special designation for subjects, so get headers
        for subject in contents.find_all('h3'):

            # Create subject topic
            title = subject.text.replace(u'\xa0', u' ').replace('\n', '')
            source_id = generate_id(title)
            category_topic = nodes.TopicNode(source_id=source_id, title=title)
            channel.add_child(category_topic)
            LOGGER.info(title)

            # Get list from subject
            book_list = subject.find_next('ul')
            for book in book_list.find_all('li'):
                book_license = LICENSE
                format_note = book.find('small')

                # Some books have subsections for different formats/licenses
                # e.g. See Business-General/Miscellaneous > Information Systems for Business and Beyond
                if format_note is not None:
                    # Determine what license to use: first known license id
                    # mentioned in the note, otherwise the default LICENSE
                    note_text = format_note.text
                    for choice in licenses.choices:
                        if choice[0] in note_text:
                            book_license = choice[0]
                            break
                    booktitle = book.contents[0]
                    LOGGER.info("    " + booktitle)
                    # Download one of the sublinks
                    for sublink in book.find_all('a'):
                        href = sublink.get('href')
                        if not href:
                            continue
                        elif "PDF" in sublink.text:
                            category_topic.add_child(
                                nodes.DocumentNode(
                                    source_id=source_id +
                                    os.path.basename(href),
                                    title=booktitle,
                                    license=book_license,
                                    copyright_holder=COPYRIGHT_HOLDER,
                                    files=[files.DocumentFile(path=href)]))
                            break  # only need to download one format of the book
                        elif "HTML" in sublink.text:
                            html_node = scrape_book(href,
                                                    license=book_license)
                            if html_node:
                                category_topic.add_child(html_node)
                                break  # only need to download one format of the book

                # Most book links go straight to an html page
                else:
                    book_url = book.find('a')['href']
                    html_node = scrape_book(book_url, book_license)
                    if html_node:
                        category_topic.add_child(html_node)
    finally:
        # No matter what, add link to video mapping for future runs
        with open(VIDEO_MAP_JSON, "w") as videojson:
            json.dump(VIDEO_MAPPING, videojson)